diff --git a/SAMtags.tex b/SAMtags.tex index c155bf83f..79fb7b869 100644 --- a/SAMtags.tex +++ b/SAMtags.tex @@ -60,6 +60,7 @@ \section{Standard tags} {\tt BQ} & Z & Offset to base alignment quality (BAQ) \\ {\tt BZ} & Z & Phred quality of the unique molecular barcode bases in the {\tt OX} tag \\ {\tt CC} & Z & Reference name of the next hit \\ + {\tt CG} & B,I & BAM only: {\sf CIGAR} in BAM's binary encoding if (and only if) it consists of $>$65535 operations \\ {\tt CM} & i & Edit distance between the color sequence and the color reference (see also {\tt NM}) \\ {\tt CO} & Z & Free-text comments \\ {\tt CP} & i & Leftmost coordinate of the next hit \\ @@ -131,6 +132,13 @@ \subsection{Additional Template and Mapping data} \item[CC:Z:\tagvalue{rname}] Reference name of the next hit; `{\tt =}' for the same chromosome. +\item[CG:B:I,\tagvalue{encodedCigar}] +Real CIGAR in its binary form if it contains $>$65535 operations. This is +a BAM-only tag, serving as a workaround for BAM's inability to store long CIGARs +in the standard way. SAM and CRAM files created with updated tools aware of the +workaround are not expected to contain this tag. See also the footnote in +Section~4.2 of the SAM spec for details. + \item[CP:i:\tagvalue{pos}] Leftmost coordinate of the next hit. diff --git a/SAMv1.tex b/SAMv1.tex index 27646a8ac..61d7303b4 100644 --- a/SAMv1.tex +++ b/SAMv1.tex @@ -827,7 +827,7 @@ \subsection{The BAM format} & \multicolumn{2}{l|}{\sf refID} & Reference sequence ID, $-1\leq{\sf refID}<{\sf n\_ref}$; -1 for a read without a mapping position. 
& {\tt int32\_t} & [-1] \\\cline{2-6} & \multicolumn{2}{l|}{\sf pos} & 0-based leftmost coordinate ($=\underline{\sf POS}-1$)& {\tt int32\_t} & [-1]\\\cline{2-6} & \multicolumn{2}{l|}{\sf bin\_mq\_nl} & {\tt{\sf bin}\char60\char60 16\char124\underline{\sf MAPQ}\char60\char60 8\char124{\sf l\_read\_name}}; {\sf bin} is computed from the mapping position;\footnotemark\ {\sf l\_read\_name} is the length of {\sf read\_name} below ($={\sf length}(\underline{\sf QNAME})+1$). & {\tt uint32\_t} & \\\cline{2-6} - & \multicolumn{2}{l|}{\sf flag\_nc} & {\tt \underline{\sf FLAG}\char60\char60 16\char124{\sf n\_cigar\_op}};\footnotemark\ {\sf n\_cigar\_op} is the number of operations in \underline{\sf CIGAR}. & {\tt uint32\_t} & \\\cline{2-6} + & \multicolumn{2}{l|}{\sf flag\_nc} & {\tt \underline{\sf FLAG}\char60\char60 16\char124{\sf n\_cigar\_op}};\footnotemark\ {\sf n\_cigar\_op} is the number of operations in \underline{\sf CIGAR}.\footnotemark & {\tt uint32\_t} & \\\cline{2-6} & \multicolumn{2}{l|}{\sf l\_seq} & Length of \underline{\sf SEQ} & {\tt int32\_t} & \\\cline{2-6} & \multicolumn{2}{l|}{\sf next\_refID} & Ref-ID of the next segment ($-1\le{\sf mate\_refID}<{\sf n\_ref}$) & {\tt int32\_t} & [-1] \\\cline{2-6} & \multicolumn{2}{l|}{\sf next\_pos} & 0-based leftmost pos of the next segment ($=\underline{\sf PNEXT}-1$) & {\tt int32\_t} & [-1] \\\cline{2-6} @@ -843,7 +843,7 @@ \subsection{The BAM format} \cline{1-6} \end{tabular}} \end{table} -\addtocounter{footnote}{-4} +\addtocounter{footnote}{-5} \footnotetext{{\sf BIN} is calculated using the {\sf reg2bin()} function in Section~\ref{sec:code}. 
For mapped reads this uses {\sf POS-1} (i.e.,~0-based left position) and the alignment end point using the @@ -857,6 +857,19 @@ \subsection{The BAM format} \footnotetext{As noted in Section~\ref{sec:alnrecord}, reserved {\sf FLAG} bits should be written as zero and ignored on reading by current software.} \stepcounter{footnote} +\footnotetext{With 16 bits, {\sf n\_cigar\_op} can represent at most 65535 CIGAR +operations in BAM files. For an alignment with more CIGAR operations, BAM +stores the real {\sf CIGAR}, encoded the same way as the {\sf cigar} field in +BAM, in the {\tt CG} optional tag of type `{\tt B,I}', and sets {\sf CIGAR} to +`{\tt kSmN}' as a placeholder, where `{\tt k}' equals {\sf l\_seq}, `{\tt m}' +is the reference sequence length in the alignment, and `{\tt S}' and `{\tt N}' +are the soft-clipping and reference-skip CIGAR operators, respectively -- +i.e.,~in the binary form, {\sf n\_cigar\_op}=2 and {\sf cigar}={\tt [k\char60\char60 +4\char124{4},m\char60\char60 4\char124{3}]}. If tag {\tt CG} is present and +the first CIGAR operation clips the entire read, a BAM parsing library is +expected to update {\sf n\_cigar\_op} and {\sf cigar} with the real CIGAR +stored in the {\tt CG} tag and remove the now-redundant {\tt CG} tag.} +\stepcounter{footnote} \footnotetext{For backward compatibility, a {\sf QNAME} `{\tt *}' is stored as a C string {\tt "*\char92 0"}.} \stepcounter{footnote} \footnotetext{An integer may be stored as one of `{\tt cCsSiI}' in BAM, representing {\tt int8\_t}, {\tt uint8\_t},