@@ -18,7 +18,65 @@ extern size_t Stringrchr(char *str,char ch, size_t stride,size_t len);
1818extern  size_t  Stringrchr2 (unsigned short  * str , unsigned short  ch , size_t  stride ,size_t  len );
1919extern  size_t  Stringrchr4 (unsigned int   * str , unsigned int   ch , size_t  stride ,size_t  len );
2020
21- #if  defined(__SSE2__ ) ||  EMU_AVX 
21+ #if  C_AVX2  ||  EMU_AVX2 
22+ 
/*
 * srchr (AVX2 build): length of str once trailing copies of ch are dropped.
 *
 * Scans backward from str[len-1] and returns p such that str[p-1] is the last
 * byte that differs from ch (so str[p..len-1] are all equal to ch).  Returns 0
 * when len is 0 or when every byte equals ch.
 *
 * str - buffer to scan; only str[0..len-1] is read
 * ch  - byte value to skip over
 * len - number of bytes in str
 */
static size_t srchr(char* str, char ch, size_t len){
 size_t i=len;
 // Step back one byte at a time until str+i is 32-byte aligned, so that every
 // vector load below is an aligned load of bytes inside str[0..i-1].
 while((i>0) && ((((intptr_t)str+i)&31)!=0)){if(ch!=str[i-1])return i; else --i;}
 if(!i)return 0;
 /* don't test i>=0 which is always true because size_t is unsigned */
 const __m256i ch32=_mm256_set1_epi8(ch);
 const __m128i ch16=_mm_set1_epi8(ch);

 // 32 bytes per step, covering str[i-32..i-1]
 while(i>=32){
  __m256i blk=_mm256_load_si256((__m256i*)(str+i-32));
  // movemask of the compare has bit k set when str[i-32+k]==ch;
  // inverting it leaves bit k set when str[i-32+k]!=ch
  unsigned int diff=~(unsigned int)_mm256_movemask_epi8(_mm256_cmpeq_epi8(blk,ch32));
  if(diff!=0){   // some character is not ch
   // Highest set bit p marks the last mismatch, at str[i-32+p];
   // the result is that index plus one, i.e. i-(31-p).
#if defined(_MSC_VER)   // make sure <intrin.h> is included
   unsigned long pos;
   _BitScanReverse(&pos,diff);
   i-=31-(size_t)pos;
#elif defined(__clang__) || ((__GNUC__>=4) || ((__GNUC__==3) && (__GNUC_MINOR__>=4))) // modern GCC has built-in __builtin_clz
   i-=(size_t)__builtin_clz(diff);   // clz(diff)==31-p
#else   // no bit-scan intrinsic known for this compiler
#error "srchr: no bit-scan intrinsic (__builtin_clz / _BitScanReverse) available"
#endif
   return i;
  }
  i-=32;
 }

 // At most one 16-byte step remains, covering str[i-16..i-1]; str+i is still
 // 32-byte aligned here, so the 16-byte load is aligned too.
 while(i>=16){
  __m128i blk=_mm_load_si128((__m128i*)(str+i-16));
  // only the low 16 bits of the movemask are meaningful
  unsigned int diff=~(unsigned int)_mm_movemask_epi8(_mm_cmpeq_epi8(blk,ch16))&0xFFFFu;
  if(diff!=0){   // some character is not ch
   // Highest set bit p (0..15) marks the last mismatch, at str[i-16+p];
   // the result is i-(15-p).
#if defined(_MSC_VER)   // make sure <intrin.h> is included
   unsigned long pos;
   _BitScanReverse(&pos,diff);
   i-=15-(size_t)pos;
#elif defined(__clang__) || ((__GNUC__>=4) || ((__GNUC__==3) && (__GNUC_MINOR__>=4))) // modern GCC has built-in __builtin_clz
   i-=(size_t)__builtin_clz(diff)-16;   // diff is 32 bits wide: clz(diff)==31-p
#else   // no bit-scan intrinsic known for this compiler
#error "srchr: no bit-scan intrinsic (__builtin_clz / _BitScanReverse) available"
#endif
   return i;
  }
  i-=16;
 }

 // fewer than 16 bytes left: finish one byte at a time
 while(i>0){if(ch!=str[i-1])return i; else --i;}
 return 0;
}
79+ #elif  defined(__SSE2__ ) ||  EMU_AVX 
2280
2381static  size_t  srchr (char *  str , char  ch , size_t  len ){
2482 size_t  i = len ;
0 commit comments