-
Notifications
You must be signed in to change notification settings - Fork 3
/
bsalign.h
4052 lines (3952 loc) · 135 KB
/
bsalign.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* bsalign.h includes 'pairwise edit alignment' and 'pairwise gap-weighting alignment'
*
* Jue Ruan <[email protected]>
*
* References:
* Myers,G. 1999. "A fast bit-vector algorithm for approximate string matching based on dynamic programming." J. ACM, 46, 395¨C41.
* Farrar, Michael. 2007. "Striped Smith-Waterman Speeds Database Searches Six Times over Other SIMD Implementations." Bioinformatics 23 (2): 156¨C61.
* Suzuki, Hajime, and Masahiro Kasahara. 2017. "Acceleration of Nucleotide Semi-Global Alignment with Adaptive Banded Dynamic Programming" BioRxiv, September, 130633.
* Suzuki, Hajime, and Masahiro Kasahara. 2018. "Introducing Difference Recurrence Relations for Faster Semi-Global Alignment of Long Sequences." BMC Bioinformatics 19 (Suppl 1).
* Li, Heng. 2018. "Minimap2: Pairwise Alignment for Nucleotide Sequences." Bioinformatics 34 (18): 3094¨C3100.
*
* To use bsalign.h in your program, please copy bsalign.h, list.h, sort.h and mem_share.h together
*
*/
#ifndef BAND_STRIPED_DNA_SEQ_ALIGNMENT_RJ_H
#define BAND_STRIPED_DNA_SEQ_ALIGNMENT_RJ_H
#include "list.h"
#include "sort.h"
#ifdef __AVX2__
#include <immintrin.h>
#else
#include <emmintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>
#endif
#define SEQALIGN_MODE_GLOBAL 0
#define SEQALIGN_MODE_OVERLAP 1
#define SEQALIGN_MODE_EXTEND 2
#define SEQALIGN_MODE_KMER 3
#define SEQALIGN_MODEMASK_TYPE 0x3
#define SEQALIGN_MODE_QPROF 4
#define SEQALIGN_MODE_MEMRESV 8
#define SEQALIGN_MODE_CIGRESV 16
#define seqalign_mode_type(mode) ((mode) & SEQALIGN_MODEMASK_TYPE)
#define SEQALIGN_BT_M 0
#define SEQALIGN_BT_I 1
#define SEQALIGN_BT_D 2
#define SEQALIGN_BT1_IE 4
#define SEQALIGN_BT1_DE 8
#define SEQALIGN_BT2_I1 1
#define SEQALIGN_BT2_D1 2
#define SEQALIGN_BT2_I2 3
#define SEQALIGN_BT2_D2 4
#define SEQALIGN_BT2_IE1 8
#define SEQALIGN_BT2_DE1 16
#define SEQALIGN_BT2_IE2 32
#define SEQALIGN_BT2_DE2 64
#define SEQALIGN_SCORE_EPI8_MIN (-(MAX_B1 >> 1))
#define SEQALIGN_SCORE_EPI8_MAX (MAX_B1 >> 1)
#define SEQALIGN_SCORE_MIN (-(MAX_B4 >> 2))
#define SEQALIGN_SCORE_MAX (MAX_B4 >> 2)
#define SEQALIGN_CIGAR_M 0
#define SEQALIGN_CIGAR_I 1
#define SEQALIGN_CIGAR_D 2
#define SEQALIGN_CIGAR_N 3
#define SEQALIGN_CIGAR_S 4
#define SEQALIGN_CIGAR_H 5
#define SEQALIGN_CIGAR_P 6
#define SEQALIGN_CIGAR_E 7
#define SEQALIGN_CIGAR_X 8
#ifdef __AVX2__
//#pragma message("Choose AVX2 in " __FILE__ ". Just a message, ignore it")
#define WORDSIZE 32
#define WORDSHIFT 5
typedef __m256i xint;
#define mm_load _mm256_load_si256
#define mm_loadu _mm256_loadu_si256
#define mm_store _mm256_store_si256
#define mm_storeu _mm256_storeu_si256
#define mm_or _mm256_or_si256
#define mm_xor _mm256_xor_si256
#define mm_and _mm256_and_si256
#define mm_andnot _mm256_andnot_si256
#define mm_setzero _mm256_setzero_si256
#define mm_set1_epi8 _mm256_set1_epi8
#define mm_set1_epi16 _mm256_set1_epi16
#define mm_set1_epi32 _mm256_set1_epi32
#define mm_set1_epi64x _mm256_set1_epi64x
#define mm_srli _mm256_srli_si256
#define mm_slli _mm256_slli_si256
#define mm_srli_epi32 _mm256_srli_epi32
#define mm_srai_epi32 _mm256_srai_epi32
#define mm_slli_epi32 _mm256_slli_epi32
#define mm_slai_epi32 _mm256_slai_epi32
#define mm_srli_epi64 _mm256_srli_epi64
#define mm_srai_epi64 _mm256_srai_epi64
#define mm_slli_epi64 _mm256_slli_epi64
#define mm_slai_epi64 _mm256_slai_epi64
#define mm_insert_epi8 _mm256_insert_epi8
#define mm_extract_epi16 _mm256_extract_epi16
#define mm_extract_epi32 _mm256_extract_epi32
#define mm_adds_epi8 _mm256_adds_epi8
#define mm_adds_epu8 _mm256_adds_epu8
#define mm_adds_epi16 _mm256_adds_epi16
#define mm_add_epi32 _mm256_add_epi32
#define mm_subs_epi8 _mm256_subs_epi8
#define mm_subs_epu8 _mm256_subs_epu8
#define mm_subs_epi16 _mm256_subs_epi16
#define mm_sub_epi32 _mm256_sub_epi32
#define mm_cmpeq_epi8 _mm256_cmpeq_epi8
#define mm_cmpgt_epi8 _mm256_cmpgt_epi8
#define mm_cmpeq_epi16 _mm256_cmpeq_epi16
#define mm_cmpgt_epi16 _mm256_cmpgt_epi16
#define mm_cmpeq_epi32 _mm256_cmpeq_epi32
#define mm_cmpgt_epi32 _mm256_cmpgt_epi32
#define mm_movemask_epi8 _mm256_movemask_epi8
#define mm_max_epi8 _mm256_max_epi8
#define mm_min_epi8 _mm256_min_epi8
#define mm_max_epu8 _mm256_max_epu8
#define mm_min_epu8 _mm256_min_epu8
#define mm_max_epi16 _mm256_max_epi16
#define mm_min_epi16 _mm256_min_epi16
#define mm_max_epi32 _mm256_max_epi32
#define mm_min_epi32 _mm256_min_epi32
#define mm_shuffle _mm256_shuffle_epi8
#define mm_blendv _mm256_blendv_epi8
#define mm_cvtepi8x0_epi16(a) _mm256_cvtepi8_epi16(_mm256_castsi256_si128(a))
#define mm_cvtepi8x1_epi16(a) _mm256_cvtepi8_epi16(_mm256_castsi256_si128(_mm256_srli_si256(a, 16)))
#define mm_cvtepi16x0_epi32(a) _mm256_cvtepi16_epi32(_mm256_castsi256_si128(a))
#define mm_cvtepi16x1_epi32(a) _mm256_cvtepi16_epi32(_mm256_castsi256_si128(_mm256_srli_si256(a, 16)))
#define mm_cvtepi8x0_epi32(a) _mm256_cvtepi8_epi32(_mm256_castsi256_si128(a))
#define mm_cvtepi8x1_epi32(a) _mm256_cvtepi8_epi16(_mm256_castsi256_si128(_mm256_srli_si256(a, 8)))
#define mm_cvtepi8x2_epi32(a) _mm256_cvtepi8_epi16(_mm256_castsi256_si128(_mm256_srli_si256(a, 16)))
#define mm_cvtepi8x3_epi32(a) _mm256_cvtepi8_epi16(_mm256_castsi256_si128(_mm256_srli_si256(a, 24)))
#define mm_packs_epi32 _mm256_packs_epi32
#define mm_packs_epi16 _mm256_packs_epi16
#else
//#pragma message("Choose SSE4.2 in " __FILE__ ". Just a message, ignore it")
#define WORDSIZE 16
#define WORDSHIFT 4
typedef __m128i xint;
#define mm_load _mm_load_si128
#define mm_loadu _mm_loadu_si128
#define mm_store _mm_store_si128
#define mm_storeu _mm_storeu_si128
#define mm_or _mm_or_si128
#define mm_xor _mm_xor_si128
#define mm_and _mm_and_si128
#define mm_andnot _mm_andnot_si128
#define mm_setzero _mm_setzero_si128
#define mm_set1_epi8 _mm_set1_epi8
#define mm_set1_epi16 _mm_set1_epi16
#define mm_set1_epi32 _mm_set1_epi32
#define mm_set1_epi64x _mm_set1_epi64x
#define mm_srli _mm_srli_si128
#define mm_slli _mm_slli_si128
#define mm_srli_epi32 _mm_srli_epi32
#define mm_srai_epi32 _mm_srai_epi32
#define mm_slli_epi32 _mm_slli_epi32
#define mm_slai_epi32 _mm_slai_epi32
#define mm_srli_epi64 _mm_srli_epi64
#define mm_srai_epi64 _mm_srai_epi64
#define mm_slli_epi64 _mm_slli_epi64
#define mm_slai_epi64 _mm_slai_epi64
#define mm_insert_epi8 _mm_insert_epi8
#define mm_extract_epi16 _mm_extract_epi16
#define mm_extract_epi32 _mm_extract_epi32
#define mm_adds_epi8 _mm_adds_epi8
#define mm_adds_epu8 _mm_adds_epu8
#define mm_adds_epi16 _mm_adds_epi16
#define mm_add_epi32 _mm_add_epi32
#define mm_subs_epi8 _mm_subs_epi8
#define mm_subs_epu8 _mm_subs_epu8
#define mm_subs_epi16 _mm_subs_epi16
#define mm_sub_epi32 _mm_sub_epi32
#define mm_cmpeq_epi8 _mm_cmpeq_epi8
#define mm_cmpgt_epi8 _mm_cmpgt_epi8
#define mm_cmpeq_epi16 _mm_cmpeq_epi16
#define mm_cmpgt_epi16 _mm_cmpgt_epi16
#define mm_cmpeq_epi32 _mm_cmpeq_epi32
#define mm_cmpgt_epi32 _mm_cmpgt_epi32
#define mm_movemask_epi8 _mm_movemask_epi8
#define mm_max_epi8 _mm_max_epi8
#define mm_min_epi8 _mm_min_epi8
#define mm_max_epu8 _mm_max_epu8
#define mm_min_epu8 _mm_min_epu8
#define mm_max_epi16 _mm_max_epi16
#define mm_min_epi16 _mm_min_epi16
#define mm_max_epi32 _mm_max_epi32
#define mm_min_epi32 _mm_min_epi32
#define mm_shuffle _mm_shuffle_epi8
#define mm_blendv _mm_blendv_epi8
#define mm_cvtepi8x0_epi16(a) _mm_cvtepi8_epi16(a)
#define mm_cvtepi8x1_epi16(a) _mm_cvtepi8_epi16(_mm_srli_si128(a, 8))
#define mm_cvtepi16x0_epi32(a) _mm_cvtepi16_epi32(a)
#define mm_cvtepi16x1_epi32(a) _mm_cvtepi16_epi32(_mm_srli_si128(a, 8))
#define mm_cvtepi8x0_epi32(a) _mm_cvtepi8_epi32(a)
#define mm_cvtepi8x1_epi32(a) _mm_cvtepi8_epi32(_mm_srli_si128(a, 4))
#define mm_cvtepi8x2_epi32(a) _mm_cvtepi8_epi32(_mm_srli_si128(a, 8))
#define mm_cvtepi8x3_epi32(a) _mm_cvtepi8_epi32(_mm_srli_si128(a, 12))
#define mm_packs_epi32 _mm_packs_epi32
#define mm_packs_epi16 _mm_packs_epi16
#endif
#define mm_sr1bit(x) mm_or(mm_slli_epi64(mm_srli(x, 8), 63), mm_srli_epi64(x, 1))
#define mm_sl1bit(x) mm_or(mm_srli_epi64(mm_slli(x, 8), 63), mm_slli_epi64(x, 1))
typedef struct {
int score;
int qb, qe;
int tb, te;
int mat, mis, ins, del, aln;
} seqalign_result_t;
/**
* Basic function referings for DNA alignment using edit distance, there are implementations in this file
*/
#define striped_seqedit_getval(xs, W, pos) (((xs)[(pos) % (W)] >> ((pos) / (W))) & 0x1)
#define striped_seqedit_qprof_size(qlen, bandwidth) ((num_max(qlen, bandwidth) + 1) * 4 * 8)
static inline void striped_seqedit_set_query_prof(u1i *qseq, u4i qlen, u4i bandwidth, u8i *qprof);
static inline void striped_seqedit_row_init(u8i *us[2], u4i W);
static inline void striped_seqedit_row_movx(u8i *us[2][2], u4i W, u4i movx, int mode, int *heading_score);
static inline void striped_seqedit_row_cal(u4i rbeg, u8i *us[2][2], u8i *hs, u8i *qprof, int mode, u4i W, u1i base);
static inline seqalign_result_t striped_seqedit_backtrace(u8i *uts[2], u4i *begs, u4i W, u1i *qseq, int qend, u1i *tseq, int tend, int mode, u4v *cigars);
// mode: SEQALIGN_MODE_GLOBAL | SEQALIGN_MODE_OVERLAP (full query but can be partial target)
static inline seqalign_result_t striped_seqedit_pairwise(u1i *qseq, u4i qlen, u1i *tseq, u4i tlen, int mode, u4i bandwidth, b1v *mempool, u4v *cigars, int verbose);
static inline seqalign_result_t kmer_striped_seqedit_pairwise(u1i ksz, u1i *qseq, u4i qlen, u1i *tseq, u4i tlen, b1v *mempool, u4v *cigars, int verbose);
#define striped_epi2_seqedit_getval(xs, W, pos) (((xs)[((((pos) % (W)) * WORDSIZE) + (((pos) / (W)) >> 3))] >> (((((pos) / ((W)))) & 0x7))) & 0x1)
#define striped_epi2_seqedit_qprof_size(qlen) (roundup_times(qlen, WORDSIZE * 8) / 2)
static inline void striped_epi2_seqedit_set_query_prof(u1i *qseq, u4i qlen, b1i *qprof);
static inline void striped_epi2_seqedit_row_init(b1i *us[2], u4i W, int mode);
static inline void striped_epi2_seqedit_row_cal(u4i rbeg, b1i *us[2][2], b1i *hs, b1i *qprof, u4i W, u1i base, int mode);
static inline seqalign_result_t striped_epi2_seqedit_backtrace(b1i *uts[2], u4i W, int mode, u1i *qseq, int qend, u1i *tseq, int tend, u4v *cigars);
static inline seqalign_result_t striped_epi2_seqedit_pairwise(u1i *qseq, u4i qlen, u1i *tseq, u4i tlen, b1v *mempool, u4v *cigars, int mode, int verbose);
// only support short query (<=128*127=16256)
// edit-overlap mode
//typedef void (*striped_epi2_seqedit_row_merge_func)(b1i *us[3][2], u2i *hs[2], u1i W);
/**
*
* My Algorithm = global/overlap alignment + striped vectorization + difference recurrence relation + adaptive banding + active F-loop
*
* I compute the score matrix in the way of one row by one row, that is y always increases 1 in next call.
* x is resorted into striped vector based on W and B.
* B: number of values in a SIMD word, W = bandwidth / B.
* When B = 4 and W = 2, there have
* normal array [0, 1, 2, 3, 4, 5, 6, 7]
* striped blocks [0, 2, 4, 6; 1, 3, 5, 7]
* running blocks [0, 1; 2, 3; 4, 5; 6, 7]
* In implementation, I use xint to store 16 int8_t, B = 16.
* H, E, F, Q, G are the absolute scores
* u, e, f, q, g are the relative scores
* S is score for bases matching
*
* Important formulas:
* H(x, y) = max(H(x - 1, y - 1) + S(x, y), E(x, y), Q(x, y), F(x, y), G(x, y))
* u(x, y) = H(x, y) - H(x - 1, y)
* h(x, y) = H(x, y) - H(x - 1, y - 1)
* # vertical cross two rows
* E(x, y) = max(E(x, y - 1) + gap_e, H(x, y - 1) + gap_o + gap_e)
* e(x, y) = E(x, y) - H(x, y)
* Q(x, y) = max(Q(x, y - 1) + gap_e2, H(x, y - 1) + gap_o2 + gap_e2)
* q(x, y) = Q(x, y) - H(x, y)
* # horizontal along a row
* F(x, y) = max(F(x - 1, y) + gap_e, H(x - 1, y) + gap_o + gap_e)
* f(x, y) = F(x, y) - H(x - 1, y - 1)
* G(x, y) = max(G(x - 1, y) + gap_e2, H(x - 1, y) + gap_o2 + gap_e2)
* g(x, y) = G(x, y) - H(x - 1, y - 1)
*
* Main steps
* 1, shifting
* shuffling the previous row into be well aligned with current row, all in striped. See banded_striped_epi8_seqalign_piece2_row_mov
*
* 2, first pass of current row to obtain f(x, y) and g(x, y) of the last striped block
* S(x, y) can be load from prepared striped query profile
* load e and q from previous row
* f(0, y) = g(0, y) = MIN_INF
* h(x, y) = max(S(x, y), e(x, y), q(x, y), f(x, y), g(x, y))
* f(x + 1, y) = max(f(x, y) + gap_e, h(x, y) + gap_o + gap_e) + u(x, y - 1) // please note u(x, y - 1) = H(x, y - 1) - H(x - 1, y - 1)
* g(x + 1, y) = max(g(x, y) + gap_e2, h(x, y) + gap_o2 + gap_e2) + u(x, y - 1)
* sum_u[] = sum(u([], y - 1)) // summing u of each running block in previous row
*
* 3, active F-loop
* F was calculated within each running block, need to check whether F can update Hs/Fs in next one or more running blocks, call this F-penetration
* Imagining a bigger F that continous updates all following Hs/Fs to the very ending, traditional lazy F-loop algorithm will perform badly
* When doing global alignment, such like cases often happen. Active F-loop first updates all Fs at the very beggining of each running block,
* then there must be at most W loops instead of 16 * W.
* f[0] = MIN_INF // f[0] = {f(0, y), f(W, y), f(2 * W, y), ..., f((B - 1) * W, y)} in striped coordinate, f(0 ... B - 1, y) in normal coordinate
* f[i] = max(f[i], f[i - 1] + sum_u[i] + 16 * gap_e)
* g[0] = MIN_INF
* g[i] = max(g[i], g[i - 1] + sum_u[i] + 16 * gap_e2)
* Now, there will be no F-penetration
*
* 4, second pass of row to calculate scores
* ... // calculate h(x, y), and update e(x, y + 1), q(x, y + 1), f(x + 1, y), and g(x + 1, y)
* u(x, y) = h(x, y) - (h(x - 1, y) - u(x - 1, y)) // update u in the loop of striped blocks
*
* 5, find max score within a row
* H(x, y) = H(x - 1, y) + u(x, y)
*
* 6, adaptive band
* if sum(H[0 .. W - 1]) > sum(H[15 * W .. 16 * W - 1]) row_offset = row_offset + 0 // in normal coordinate
* if sum(H[0 .. W - 1]) == sum(H[15 * W .. 16 * W - 1]) row_offset = row_offset + 1
* if sum(H[0 .. W - 1]) < sum(H[15 * W .. 16 * W - 1]) row_offset = row_offset + 2
* keep the bandwidth, but shift the offset of band in this row for next call, see 1) shifting
*
*/
/**
* Basic function referings for global/extend/overlap DNA sequence alignment, there are implementations in this file
*/
#define banded_striped_epi8_pos2idx(bandwidth, pos) ((((pos) % (bandwidth >> WORDSHIFT)) << WORDSHIFT) + ((pos) / ((bandwidth) >> WORDSHIFT)))
static inline void banded_striped_epi8_seqalign_set_score_matrix(b1i matrix[16], b1i mat, b1i mis){ u4i i; for(i=0;i<16;i++) matrix[i] = ((i ^ (i >> 2)) & 0x3)? mis : mat; }
#define banded_striped_epi8_seqalign_qprof_size(qlen, bandwidth) (((num_max(qlen, bandwidth) + 1) * 4) * WORDSIZE)
#define banded_striped_epi8_seqalign_get_qprof_value(qprof, pos, base) (qprof)[((pos) * 4 + (base)) * WORDSIZE]
// prepare query profile
static inline void banded_striped_epi8_seqalign_set_query_prof_native(u1i *qseq, u4i qlen, b1i *qprof, u4i bandwidth, b1i mtx[16]);
static inline void banded_striped_epi8_seqalign_set_query_prof(u1i *qseq, u4i qlen, b1i *qprof, u4i bandwidth, b1i mtx[16]);
static inline void banded_striped_epi8_seqalign_set_query_prof_hpc(u1i *qseq, u4i qlen, b1i *qprof, u4i bandwidth, b1i mtx[16], b1i bonus);
// prepare the rows[-1]
// mode = SEQALIGN_MODE_GLOBAL/SEQALIGN_MODE_EXTEND/SEQALIGN_MODE_OVERLAP
// length of ubegs = WORDSIZE + 1
static void banded_striped_epi8_seqalign_piecex_row_init(b1i *us, b1i *es, b1i *qs, int *ubegs, int *rbeg, int mode, u4i bandwidth, b1i max_nt, b1i min_nt, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2);
// W = bandwidth / WORDSIZE, WORDSIZE = 16 (SSEx 128) or 32 (AVX2 512)
// converting from [1] to [0]
// make the two row aligned
// ubegs is the absolute scores for the first striped block
// movx can be any value
static inline void banded_striped_epi8_seqalign_piecex_row_movx(b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], u4i W, u4i movx, int piecewise, b1i nt_max, b1i nt_min, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2);
// mov MUST <= W
static inline void banded_striped_epi8_seqalign_piecex_row_mov(b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], u4i W, u4i mov, int piecewise, b1i nt_min, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2);
// core func to update row scores, from us[0] to us[1]
// rh is the score of H(-1, y - 1), 0, or SCORE_MIN, please see banded_striped_epi8_seqalign_pairwise
// ubegs[i] - ubegs[i-1] is used in F-penetration
// ubegs will be updated
// @return: score of H(-1, y), note that H(-1, y) is useful to restore all scores of row in row_max
static inline int banded_striped_epi8_seqalign_piece0_row_btcal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], b1i *bs, int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh);
static inline int banded_striped_epi8_seqalign_piece1_row_btcal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], b1i *bs, int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh);
static inline int banded_striped_epi8_seqalign_piece2_row_btcal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], b1i *bs, int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh);
static inline int banded_striped_epi8_seqalign_piece0_row_cal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh);
static inline int banded_striped_epi8_seqalign_piece1_row_cal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh);
static inline int banded_striped_epi8_seqalign_piece2_row_cal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh);
static inline int banded_striped_epi8_seqalign_piecex_row_cal(u4i rbeg, u1i base, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, u4i W, u4i mov, int rh, int piecewise);
// cmp [0] and [1], merges into [2], records original in os[u, e, q], {us, es, qs, ubegs}
// this function is used to merge multiple progenitors into current node in graph based alignment, program should call row_mov(i)+row_cal(i) before row_merge(1 .. n in pairwise)
static inline void banded_striped_epi8_seqalign_piecex_row_merge(b1i *us[3], b1i *es[3], b1i *qs[3], int *ubegs[3], u4i W, int piecewise);
static inline int banded_striped_epi8_seqalign_piecex_row_verify(int rowidx, int rowoff, int mode, int W, int mov, u1i tbase, b1i *us[2], b1i *es[2], b1i *qs[2], int *ubegs[2], b1i *qprof, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2);
// find max score and return the normal position in band
static inline u4i banded_striped_epi8_seqalign_row_max(b1i *us, int *ubegs, u4i W, int *max_score);
// @return suggestion for band moving
// 0: move band toward left
// 1: keep band in diagonal
// 2: move band toward right
static inline int banded_striped_epi8_seqalign_band_mov(b1i *us, int *ubegs, u4i W, u4i tidx, u4i qoff, u4i qlen);
// combine multiple input bands into a max weight output band, used in graph alignment
// every band contributes its weight in the manner of max(v1, .., vn), and find a max weight region(band)
// the weights in a running block are all set to (ubegs[i] + ubegs[i+1]) / 2 to fast calculate
static inline u4i banded_striped_epi8_seqalign_band_comb(u4i cnt, u4i *qoffs, int **ubegs, b1v *mempool, u4i W, u4i qlen);
// get the absolute score of a position
static inline int banded_striped_epi8_seqalign_getscore(b1i *us, int *ubegs, u4i W, u8i pos);
// backtrace
// rs->qe, rs->te and rs->score MUST be set before call this function
// begs provides the band's offset of rows, it is continously suming of mov in banded_striped_epi8_seqalign_piecex_row_mov_func
static inline u4i banded_striped_epi8_seqalign_piecex_backtrace(u1i *qseq, u1i *tseq, b1i *bs, int *begs, int mode, u4i bandwidth, int piecewise, seqalign_result_t *rs, u4v *cigars);
// backcal
// restores the best path by revise calculating
static inline u4i banded_striped_epi8_seqalign_piecex_backcal(u1i *qseq, u1i *tseq, b1i *ups, b1i *eps, b1i *qps, b1i *ubs, int *roffs, int mode, u4i bandwidth, b1i *matrix, b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, seqalign_result_t *rs, u4v *cigars);
static inline u4i seqalign_cigar2alnstr(u1i *qseq, u1i *tseq, seqalign_result_t *rs, u4v *cigars, char *alnstr[3], u4i length);
static inline void seqalign_cigar2alnstr_print(char *qtag, u1i *qseq, u4i qlen, char *ttag, u1i *tseq, u4i tlen, seqalign_result_t *rs, u4v *cigars, int linewidth, FILE *out);
// implementation of overlap alignment for two sequences
// bandwidth should be times of WORDSIZE
static inline seqalign_result_t banded_striped_epi8_seqalign_pairwise(u1i *qseq, u4i qlen, u1i *tseq, u4i tlen, b1v *mempool, u4v *cigars, int mode, u4i bandwidth, b1i matrix[16], b1i gapo1, b1i gape1, b1i gapo2, b1i gape2, int verbose);
static inline void _push_cigar_u4v(u4v *cigars, u1i op, u4i sz){
if(cigars->size && (cigars->buffer[cigars->size - 1] & 0xf) == op){
cigars->buffer[cigars->size - 1] += sz << 4;
} else {
push_u4v(cigars, sz << 4 | op);
}
}
static inline u4i _push_cigar_bsalign(u4v *cigars, u4i cg, u1i op, u4i sz){
if(op == (cg & 0xf)){
cg += sz << 4;
} else {
if(cigars && cg) push_u4v(cigars, cg);
cg = sz << 4 | op;
}
return cg;
}
static inline int seqalign_iter_cigars(u4i *cigars, u4i size, u8i *iter_ptr){
u4i cgoff, opoff, op, sz;
cgoff = iter_ptr[0] >> 32;
opoff = iter_ptr[0] & 0xFFFFFFFFU;
while(cgoff < size){
op = cigars[cgoff] & 0x0f;
sz = cigars[cgoff] >> 4;
if(opoff >= sz){
cgoff ++;
opoff = 0;
continue;
}
opoff ++;
iter_ptr[0] = (((u8i)cgoff) << 32) | opoff;
return op;
}
iter_ptr[0] = (((u8i)cgoff) << 32) | opoff;
return -1;
}
static inline u4i seqalign_left_tidy_cigars(u1i *qseq, u1i *tseq, seqalign_result_t *rs, u4v *cigars){
u8i iter;
u4i cg, sz, ret;
u1i alns[2][64], *seqs[2];
int op, x[3][2], z, p, q, i, L;
iter = 0;
L = 64;
seqs[0] = qseq;
seqs[1] = tseq;
x[0][0] = x[1][0] = rs->qb;
x[0][1] = x[1][1] = rs->tb;
x[2][0] = x[2][1] = z = p = 0;
sz = cigars->size;
cg = 0;
ret = 0;
inline void pop_aln2cigar(){
// left tidy
do {
if(alns[0][p] == 5){
if(alns[1][p] == 5){
q = 2;
break;
} else {
q = 0;
}
} else if(alns[1][p] == 5){
q = 1;
} else {
break;
}
for(i=1;i<z;i++){
if(alns[q][(p+i)%L] == alns[!q][p]){
alns[q][p] = alns[!q][p];
alns[q][(p+i)%L] = 5;
ret ++;
break;
} else if(alns[q][(p+i)%L] != 5){
break;
}
}
} while(0);
if(q == 2){
// skip
} if(alns[0][p] == 5){
cg = _push_cigar_bsalign(cigars, cg, SEQALIGN_CIGAR_D, 1);
} else if(alns[1][p] == 5){
cg = _push_cigar_bsalign(cigars, cg, SEQALIGN_CIGAR_I, 1);
} else {
cg = _push_cigar_bsalign(cigars, cg, SEQALIGN_CIGAR_M, 1);
}
p = (p + 1) % L;
z --;
}
while((op = seqalign_iter_cigars(cigars->buffer, sz, &iter)) != -1){
switch(op){
case 0:
case 7:
case 8:
op = 3;
break;
case 1:
case 4:
op = 1;
break;
case 2:
case 3:
op = 2;
break;
}
if(z == L){
pop_aln2cigar();
}
q = (p + z) % L;
if(z < L) z ++;
for(i=0;i<2;i++){
if(op & (1 << i)){
alns[i][q] = seqs[i][x[0][i]++];
} else {
alns[i][q] = 5;
}
}
}
while(z){
pop_aln2cigar();
}
if(cg){
push_u4v(cigars, cg);
}
remove_array_u4v(cigars, 0, sz);
return ret;
}
static inline u4i seqalign_cigar2alnstr(u1i *qseq, u1i *tseq, seqalign_result_t *rs, u4v *cigars, char *alnstr[3], u4i length){
u4i i, j, x, y, z, op, sz;
if(alnstr == NULL) return 0;
if(length == 0){
length = rs->aln;
alnstr[0] = realloc(alnstr[0], length + 1);
alnstr[1] = realloc(alnstr[1], length + 1);
alnstr[2] = realloc(alnstr[2], length + 1);
}
z = 0;
x = rs->qb;
y = rs->tb;
for(i=0;i<cigars->size;i++){
op = cigars->buffer[i] & 0xf;
sz = cigars->buffer[i] >> 4;
sz = num_min(sz, length - z);
switch(op){
case 0:
case 7:
case 8:
for(j=0;j<sz;j++){
alnstr[2][z] = (qseq[x] == tseq[y])? '|' : '*';
alnstr[0][z] = "ACGTN-"[qseq[x ++]];
alnstr[1][z] = "ACGTN-"[tseq[y ++]];
z ++;
}
break;
case 1:
case 4:
for(j=0;j<sz;j++){
alnstr[2][z] = '-';
alnstr[0][z] = "ACGTN-"[qseq[x ++]];
alnstr[1][z] = '-';
z ++;
}
break;
case 2:
case 3:
for(j=0;j<sz;j++){
alnstr[2][z] = '-';
alnstr[0][z] = '-';
alnstr[1][z] = "ACGTN-"[tseq[y ++]];
z ++;
}
}
if(z == length) break;
}
alnstr[0][z] = 0;
alnstr[1][z] = 0;
alnstr[2][z] = 0;
return z;
}
static inline void seqalign_cigar2alnstr_print(char *qtag, u1i *qseq, u4i qlen, char *ttag, u1i *tseq, u4i tlen, seqalign_result_t *rs, u4v *cigars, int linewidth, FILE *out){
char *alnstr[3];
alnstr[0] = alnstr[1] = alnstr[2] = NULL;
seqalign_cigar2alnstr(qseq, tseq, rs, cigars, alnstr, 0);
fprintf(out, "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%0.3f\t%d\t%d\t%d\t%d\n", qtag, qlen, rs->qb, rs->qe, ttag, tlen, rs->tb, rs->te, rs->score, 1.0 * rs->mat / num_max(rs->aln, 1), rs->mat, rs->mis, rs->ins, rs->del);
if(linewidth <= 0){
fprintf(out, "%s\n%s\n%s\n", alnstr[0], alnstr[2], alnstr[1]);
} else {
int i, b, e, qn, tn;
char tmp;
qn = rs->qb;
tn = rs->tb;
for(b=0;b<rs->aln;b+=linewidth){
e = num_min(b + linewidth, rs->aln);
for(i=b;i<e;i++){
if(alnstr[0][i] != '-') qn ++;
if(alnstr[1][i] != '-') tn ++;
}
tmp = alnstr[0][e]; alnstr[0][e] = 0; fprintf(stdout, "%s\tQ[%d]\n", alnstr[0] + b, qn); alnstr[0][e] = tmp;
tmp = alnstr[2][e]; alnstr[2][e] = 0; fprintf(stdout, "%s\n", alnstr[2] + b); alnstr[2][e] = tmp;
tmp = alnstr[1][e]; alnstr[1][e] = 0; fprintf(stdout, "%s\tT[%d]\n", alnstr[1] + b, tn); alnstr[1][e] = tmp;
}
}
free(alnstr[0]);
free(alnstr[1]);
free(alnstr[2]);
}
static inline void striped_seqedit_set_query_prof(u1i *qseq, u4i qlen, u4i bandwidth, u8i *qprof){
u8i *qp, *pq;
u4i xlen, x, pos, j, W;
W = bandwidth / 64;
xlen = num_max(qlen, bandwidth);
memset(qprof, 0, 4 * (xlen + 1) * 8);
#if 1
// fill first W striped blocks
for(x=0;x<W;x++){
qp = qprof + 4 * x;
for(j=0;j<64;j++){
pos = x + j * W; // pos always less than qlen
if(pos < qlen){
qp[qseq[pos]] |= 1LLU << j;
}
}
}
// sliding
for(;x<=xlen;x++){
qp = qprof + 4 * x;
pq = qp - 4 * W;
qp[0] = pq[0] >> 1;
qp[1] = pq[1] >> 1;
qp[2] = pq[2] >> 1;
qp[3] = pq[3] >> 1;
pos = x + 63 * W;
if(pos < qlen){
qp[qseq[pos]] |= 1LLU << 63; // append the last one
}
}
#else
for(pos=0;pos<qlen;pos++){
qp = qprof + pos * 4 + qseq[pos];
x = num_min(63, pos / W);
for(j=0;j<=x;j++){
(qp - j * W * 4)[0] |= 1LLU << j;
}
}
#endif
}
static inline void striped_seqedit_row_init(u8i *us[2], u4i W){
memset(us[0], 0x00, W * sizeof(u8i));
memset(us[1], 0xFF, W * sizeof(u8i));
}
static inline void striped_seqedit_row_movx(u8i *us[2][2], u4i W, u4i movx, int mode, int *sbeg){
u8i *p1, *p2, *p3, *p4, MASK;
u4i i, mov, div, cyc;
if(seqalign_mode_type(mode) == SEQALIGN_MODE_OVERLAP){
sbeg[0] = 0;
memcpy(us[0][0], us[1][0], W * sizeof(u8i));
memcpy(us[0][1], us[1][1], W * sizeof(u8i));
return;
}
if(movx == 0){
sbeg[0] ++;
} else {
mov = num_min(movx, W * 64);
for(i=0;i<mov;i++){
sbeg[0] -= striped_seqedit_getval(us[1][0], W, i);
sbeg[0] += striped_seqedit_getval(us[1][1], W, i);
}
sbeg[0] ++;
}
if(movx == 0){
memcpy(us[0][0], us[1][0], W * sizeof(u8i));
memcpy(us[0][1], us[1][1], W * sizeof(u8i));
return;
}
if(movx >= W * 64){
memset(us[0][0], 0x00, W * sizeof(u8i));
memset(us[0][1], 0xFF, W * sizeof(u8i));
return;
}
cyc = movx / W;
mov = movx % W;
div = W - mov;
if(cyc){
MASK = 0xFFFFFFFFFFFFFFFFLLU << (64 - cyc);
p1 = us[0][0];
p2 = us[1][0] + mov;
p3 = us[0][1];
p4 = us[1][1] + mov;
for(i=0;i<div;i++){
p1[i] = p2[i] >> cyc;
p3[i] = (p4[i] >> cyc) | MASK;
}
} else {
memcpy(us[0][0], us[1][0] + mov, div * sizeof(u8i));
memcpy(us[0][1], us[1][1] + mov, div * sizeof(u8i));
}
p1 = us[0][0] + div;
p2 = us[1][0];
p3 = us[0][1] + div;
p4 = us[1][1];
if(cyc){
MASK = 0xFFFFFFFFFFFFFFFFLLU << (64 - cyc - 1);
for(i=div;i<W;i++){
p1[i] = p2[i] >> (cyc + 1);
p3[i] = (p4[i] >> (cyc + 1)) | MASK;
}
} else {
MASK = 0xFFFFFFFFFFFFFFFFLLU << (64 - 1);
for(i=0;i<mov;i++){
p1[i] = p2[i] >> 1;
p3[i] = (p4[i] >> 1) | MASK;
}
}
}
// H(x, y) = H(x - 1, y - 1) + min(s, u + 1, v + 1)
// h = H(x, y) - H(x - 1, y - 1) = min(s, u + 1, v + 1) = 0/1
// s u v = h u'
// 0(01) -1(10) -1(10) = 0(00) 1(01)
// 0(01) -1(10) 0(00) = 0(00) 0(00)
// 0(01) -1(10) 1(01) = 0(00) -1(10)
// 0(01) 0(00) -1(10) = 0(00) 1(01)
// 0(01) 0(00) 0(00) = 0(00) 0(00)
// 0(01) 0(00) 1(01) = 0(00) -1(10)
// 0(01) 1(01) -1(10) = 0(00) 1(01)
// 0(01) 1(01) 0(00) = 0(00) 0(00)
// 0(01) 1(01) 1(01) = 0(00) -1(10)
// 1(00) -1(10) -1(00) = 0(00) 1(00)
// 1(00) -1(10) 0(00) = 0(00) 0(00)
// 1(00) -1(10) 1(01) = 0(00) -1(10)
// 1(00) 0(00) -1(10) = 0(00) 1(01)
// 1(00) 0(00) 0(00) = 1(01) 1(01)
// 1(00) 0(00) 1(01) = 1(01) 0(00)
// 1(00) 1(01) -1(10) = 0(00) 1(01)
// 1(00) 1(01) 0(00) = 1(01) 1(01)
// 1(00) 1(01) 1(01) = 1(01) 0(00)
// ^^ ^^ ^^ ^^ ^^
// ab cd ef xy jk
// x = 0
// y = ~(b | c | e)
// manipulate bit by bit for u' = h - v
// h v = u'
// 0(00) 0(00) = 0(00)
// 0(00) 1(01) = -1(10)
// 0(00) -1(10) = 1(01)
// 1(01) 0(00) = 1(01)
// 1(01) 1(01) = 0(00)
// ^^ ^^ ^^
// ab cd xy
// b <- h
// c <- v1
// d <- v2
// x <- u3
// y <- u4
// x = d & (~(b))
// y = d ^ (b | c | d)
// the same will be with v' = h - u
//
static inline void striped_seqedit_row_cal(u4i rbeg, u8i *us[2][2], u8i *hs, u8i *qprof, int mode, u4i W, u1i base){
u8i s, u1, u2, u3, u4, v1, v2, h, h2;
u4i i, running;
v1 = 0x0000000000000000LLU;
v2 = (seqalign_mode_type(mode) == SEQALIGN_MODE_OVERLAP)? 0x0000000000000000LLU : 0xFFFFFFFFFFFFFFFFLLU;
for(i=0;__builtin_expect(i<W, 0);i++){
s = qprof[(rbeg + i) * 4 + base];
u1 = us[0][0][i];
u2 = us[0][1][i];
h = ~(s | u1 | v1);
u3 = (~h) & v2;
u4 = v2 ^ (h | v1 | v2);
v1 = (~h) & u2;
v2 = u2 ^ (h | u1 | u2);
hs[i] = h;
us[1][0][i] = u3;
us[1][1][i] = u4;
}
running = 1;
while(running){ // SWAT
v1 <<= 1;
v2 <<= 1;
if(seqalign_mode_type(mode) != SEQALIGN_MODE_OVERLAP){
v2 |= 1;
}
for(i=0;i<W;i++){
s = qprof[(rbeg + i) * 4 + base];
h2 = hs[i];
u1 = us[0][0][i];
u2 = us[0][1][i];
h = ~(s | u1 | v1);
u3 = (~h) & v2;
u4 = v2 ^ (h | v1 | v2);
v1 = (~h) & u2;
v2 = u2 ^ (h | u1 | u2);
hs[i] = h;
us[1][0][i] = u3;
us[1][1][i] = u4;
if(h == h2){
running = 0;
break;
}
}
}
}
// assume WORDSIZE == 16
static inline int striped_seqedit_rowmin(int _sbeg, u8i *us[2], u4i W, u4i *whence){
xint h, u1, u2, u3, m, c, p, d, hh[4], mm[4], pp[4], MASK1, MASK2, ONES;
u8i ux[2];
u4i i, blk, ib, ie, pmin;
int xs[16], sc, st, sbeg, smin;
b1i _mask1[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
b1i _mask2[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
//#define DEBUG_ROWMIN
#ifdef DEBUG_ROWMIN
u4i j;
int abc[1024];
b1i tmp[16];
#endif
sbeg = _sbeg;
smin = sbeg; pmin = 0;
MASK1 = mm_load((xint*)_mask1);
MASK2 = mm_load((xint*)_mask2);
ONES = mm_set1_epi8(1);
for(blk=0;blk<4;blk++){
hh[0] = hh[1] = hh[2] = hh[3] = mm_setzero();
mm[0] = mm[1] = mm[2] = mm[3] = mm_setzero();
pp[0] = pp[1] = pp[2] = pp[3] = mm_setzero();
for(ib=0;ib<W;ib=ie){
ie = num_min(ib + 124, W);
h = mm_setzero();
m = mm_setzero();
p = mm_setzero();
for(i=ib;i<ie;i++){
ux[0] = (us[0][i] >> ((blk) << 4)) & 0xFFFFU;
ux[1] = ux[0] >> 8;
ux[0] = ux[0] & 0xFF;
u3 = mm_set1_epi8(ux[0]);
u1 = mm_and(u3, MASK1);
u3 = mm_set1_epi8(ux[1]);
u3 = mm_and(u3, MASK2);
u1 = mm_or(u1, u3);
u1 = mm_cmpeq_epi8(u1, mm_setzero());
u1 = mm_andnot(u1, ONES);
#ifdef DEBUG_ROWMIN
mm_store((xint*)tmp, u1);
for(j=0;j<16;j++){
if((int)striped_seqedit_getval(us[0], W, i + (blk * 16 + j) * W) != tmp[j]){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
}
#endif
ux[0] = (us[1][i] >> ((blk) << 4)) & 0xFFFFU;
ux[1] = ux[0] >> 8;
ux[0] = ux[0] & 0xFF;
u3 = mm_set1_epi8(ux[0]);
u2 = mm_and(u3, MASK1);
u3 = mm_set1_epi8(ux[1]);
u3 = mm_and(u3, MASK2);
u2 = mm_or(u2, u3);
u2 = mm_cmpeq_epi8(u2, mm_setzero());
u2 = mm_andnot(u2, ONES);
#ifdef DEBUG_ROWMIN
mm_store((xint*)tmp, u2);
for(j=0;j<16;j++){
if((int)striped_seqedit_getval(us[1], W, i + (blk * 16 + j) * W) != tmp[j]){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
}
#endif
h = mm_subs_epi8(h, u1);
h = mm_adds_epi8(h, u2);
c = mm_cmpgt_epi8(m, h);
d = mm_set1_epi8(i - ib);
p = mm_blendv(p, d, c);
m = mm_min_epi8(m, h);
}
for(i=0;i<4;i++){
d = mm_add_epi32(hh[i], mm_cvtepi8x0_epi32(m));
c = mm_cmpgt_epi32(mm[i], d);
mm[i] = mm_min_epi32(mm[i], d);
pp[i] = mm_blendv(pp[i], mm_add_epi32(mm_cvtepi8x0_epi32(p), mm_set1_epi32(ib)), c);
hh[i] = mm_add_epi32(hh[i], mm_cvtepi8x0_epi32(h));
h = mm_srli(h, 4);
m = mm_srli(m, 4);
p = mm_srli(p, 4);
}
}
for(i=0;i<4;i++){
mm_store(((xint*)xs) + i, hh[i]);
}
for(i=0;i<16;i++){
sc = xs[i];
xs[i] = sbeg;
sbeg += sc;
}
#ifdef DEBUG_ROWMIN
for(i=0;i<16;i++){
abc[16 * blk + i] = xs[i];
fprintf(stderr, "SSE[%d] = %d\n", 16 * blk + i, xs[i]);
}
#endif
for(i=0;i<4;i++){
mm_store(hh + i, ((xint*)xs)[i]);
mm[i] = mm_add_epi32(hh[i], mm[i]);
mm_store(((xint*)xs) + i, mm[i]);
}
sc = xs[0];
st = 0;
for(i=1;i<16;i++){
if(sc > xs[i]){
sc = xs[i];
st = i;
}
}
if(sc >= smin) continue;
smin = sc;
mm_store(((xint*)xs), pp[st / 4]);
pmin = (blk * 16 + st) * W + xs[st % 4];
}
#ifdef DEBUG_ROWMIN
{
int ls = 0;
sc = st = _sbeg;
ib = 0;
for(i=0;i<W*64;i++){
if((i % W) == 0){
fprintf(stderr, "[%d] %d %d\n", i / W, st, st - ls);
if(st != abc[i / W]){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
ls = st;
}
st -= striped_seqedit_getval(us[0], W, i);
st += striped_seqedit_getval(us[1], W, i);
if(st < sc){
sc = st;
ib = i;
}
}
if(sc != smin){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
if(ib != pmin){
fflush(stdout); fprintf(stderr, " -- something wrong in %s -- %s:%d --\n", __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
abort();
}
fflush(stdout); fprintf(stderr, " -- smin=%d pmin=%d in %s -- %s:%d --\n", smin, pmin, __FUNCTION__, __FILE__, __LINE__); fflush(stderr);
}
#endif
if(whence) whence[0] = pmin;
return smin;
}
static inline seqalign_result_t striped_seqedit_backtrace(u8i *uts[2], u4i *begs, u4i W, u1i *qseq, int x, u1i *tseq, int y, int mode, u4v *cigars){
seqalign_result_t rs;
u4i cg, op, cgz;
int u1, u2, u3, u4;
ZEROS(&rs);
rs.qe = x + 1;
rs.te = y + 1;
cgz = 0;
if(cigars){
if(mode & SEQALIGN_MODE_CIGRESV){
cgz = cigars->size;
} else {
clear_u4v(cigars);
}
}
cg = op = 0;
while(x >= 0 && y >= 0){
if(qseq[x] == tseq[y]){
rs.mat ++;
op = 0;
x --;
y --;
} else {
u3 = striped_seqedit_getval(uts[0] + (y + 1) * W, W, x - begs[y + 1]);
u4 = striped_seqedit_getval(uts[1] + (y + 1) * W, W, x - begs[y + 1]);
if(u3 == 0 && u4 == 1){ // H(x - 1, y) - H(x - 1, y - 1) + 1 == 0
rs.ins ++;
op = 1; // I
x --;
} else {
u1 = striped_seqedit_getval(uts[0] + (y + 0) * W, W, x - begs[y + 0]);
u2 = striped_seqedit_getval(uts[1] + (y + 0) * W, W, x - begs[y + 0]);
if(u1 == 1 && u2 == 0){
rs.del ++;
op = 2; // D
y --;