From d74d1d79450018e3cd9e9766ac67fd7eee7df998 Mon Sep 17 00:00:00 2001 From: Michael Hoffman Date: Wed, 19 May 2021 15:36:06 -0400 Subject: [PATCH 1/9] Add BEDv1.tex Source: https://github.com/ga4gh/ga4gh-bed/pull/2, which pulls from michaelmhoffman/ga4gh-bed@7f13c7f453d08be9041669af42f3c8b8eb141aca [Rebased onto mainline latexmk infrastructure changes.] --- BEDv1.tex | 393 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 393 insertions(+) create mode 100644 BEDv1.tex diff --git a/BEDv1.tex b/BEDv1.tex new file mode 100644 index 000000000..91f2bf82f --- /dev/null +++ b/BEDv1.tex @@ -0,0 +1,393 @@ +\documentclass[11pt]{article} +\usepackage[T1]{fontenc} + +\usepackage[letterpaper,margin=1in]{geometry} + +\usepackage{acronym} +\usepackage{amsmath} +\usepackage{booktabs} +\usepackage[flushmargin,hang]{footmisc} +\usepackage{microtype} +\usepackage{newverbs} +\usepackage{tablefootnote} +\usepackage{tabularx} +\usepackage{todonotes} +\usepackage[hyperfootnotes=false]{hyperref} % doesn't work in tabulars as currently set +\usepackage{footnotehyper} +\usepackage[strict]{changepage} +\usepackage[binary-units=true]{siunitx} +\usepackage[mark]{gitinfo2} + +\hypersetup{colorlinks=true, + linkcolor=blue, + filecolor=magenta, + urlcolor=blue, + pdfinfo={githash=\gitHash}} + +\definecolor{cverbbg}{gray}{0.93} + +\title{The Browser Extensible Data~(BED) format} +\author{Jeffrey Niu, Danielle Denisko, Michael M.~Hoffman} +\date{\today} + +\setlength{\emergencystretch}{\hsize} +\setlength{\footnotemargin}{1em} + +\interfootnotelinepenalty=1000000 +\makesavenoteenv{tabularx} + +\newcolumntype{L}{>{\raggedright\arraybackslash}X} + +% eliminate passive voice warnings +% chktex-file -3 + +\begin{document} +\makeatletter +\renewcommand*{\AC@hyperlink}[2]{#2} % do not hyperlink acronyms +\makeatother + + +\frenchspacing + +\maketitle + +\acused{ASCII} + +\section{Specification} + +\Ac{BED} is a whitespace-delimited file format, where each~\textbf{file} consists of 
one or more~\textbf{line}s.\footnote{``Frequently Asked Questions: Data File Formats.'' \ac{UCSC} Genome Browser FAQ, \url{https://genome.ucsc.edu/FAQ/FAQformat.html}} +Each~\textbf{line} describes discrete genomic~\textbf{feature}s by physical start and end position on a linear~\textbf{chromosome}. +The file extension for the \ac{BED} format is~\texttt{.bed}. + +\subsection{Typographic conventions} + +This document uses the following typographic conventions: + +\vspace{2ex} + +\noindent +\begin{tabularx}{\textwidth}{r L L} + \toprule + Style & Meaning & Examples \\ + \midrule + Bold & Terms defined in subsections~\ref{sec:terms}--\ref{sec:lines} & \textbf{chromosome}{\quad}\textbf{file} \\ + Sans serif & Names of~\textbf{field}s & \textsf{chrom}{\quad}\textsf{chromStart}{\quad}\textsf{chromEnd} \\ + Fixed-width & Literals or \ac{regex}es\footnote{POSIX/IEEE~1003.1--2017 Extended Regular Expressions, for the ``C'' locale. + \emph{IEEE Standard for Information Technology---Portable Operating System Interface~(POSIX) Base Specifications}, IEEE~1003.1--2017, 2017} & \texttt{.bed}{\quad}\texttt{grep}{\quad}\texttt{[[:alnum:]]+}{\quad}\texttt{ATCG} \\ + \bottomrule +\end{tabularx} + +\subsection{Terminology and concepts}\label{sec:terms} +\begin{description} +\item[0-start, half-open coordinate system:] + A coordinate system where the first base starts at position~0, and the start of the interval is included but the end is not. + For example, for a sequence of bases~\texttt{ACTGCG}, the bases given by the interval~[2,~4) are~\texttt{TG}. % chktex 9 + +\item[BED$n$:] + A~\textbf{file} with the first $n$~\textbf{field}s of the \ac{BED} format. + For example, \textbf{BED3}~means a~\textbf{file} with only the first 3~\textbf{field}s; \textbf{BED12}~means a~\textbf{file} with all 12~\textbf{field}s. + +\item[BED$n$+:] + A~\textbf{file} that has $n$~\textbf{field}s of the \ac{BED} format, followed by any number of~\textbf{field}s of custom data defined by a user. 
+ +\item[BED$n$+$m$:] + A~\textbf{file} that has a custom tab-delimited format starting with the first $n$~\textbf{field}s of the \ac{BED} format, followed by $m$~\textbf{field}s of custom data defined by a user. + For example, \textbf{BED6+4}~means a~\textbf{file} with the first 6~\textbf{field}s of the \ac{BED} format, followed by 4~user-defined~\textbf{field}s. + +\item[block:] + Linear subfeatures within a~\textbf{feature}. + Usually used to designate exons. + +\item[chromosome:] + A sequence of nucleobases with a name. + In this specification, ``chromosome'' may also describe a named scaffold that does not fit the biological definition of a chromosome. + Often, chromosomes are numbered starting from~\texttt{1}. + There are also often sex chromosomes such as~\texttt{W}, \texttt{X}, \texttt{Y}, and~\texttt{Z}, mitochondrial chromosomes such as~\texttt{M}, and possibly scaffolds from an unknown chromosome, often labeled~\texttt{Un}. + The name of each chromosome is often prefixed with~\texttt{chr}. + Examples of chromosome names include~\texttt{chr1}, \texttt{21}, \texttt{chrX}, \texttt{chrM}, \texttt{chrUn}, \texttt{chr19\_KI270914v1\_alt}, and~\texttt{chrUn\_KI270435v1}. + +\item[feature:] + A linear region of a~\textbf{chromosome} with specified properties. + For example, a~\textbf{file}'s~\textbf{feature}s might all be peaks called from ChIP-seq data, or transcripts. + +\item[field:] + Data stored as non-tab text. + All~\textbf{field}s are 7-bit US \ac{ASCII}. + +\item[file:] + Sequence of one or more~\textbf{line}s. + +\item[line:] + String terminated by a~\textbf{line separator}, in one of the following classes. + Either a~\textbf{data line}, a~\textbf{comment line}, or a~\textbf{blank line}. + Discussed more fully in~\autoref{sec:lines} + +\item[line separator:] + Either carriage return, line feed, or carriage return followed by line feed. + The same \textbf{line separator} must be used throughout the \textbf{file}. 
+\end{description} + +\subsection{Lines}\label{sec:lines} + +\subsubsection{Data lines} + +Data lines contain \textbf{feature}~information. +A data line is composed of~\textbf{field}s separated by whitespace. +The whitespace must match the \ac{regex}~\texttt{[[:space:]]+}\footnote{\texttt{[[:space:]]} includes the following characters: space, form-feed, newline, carriage-return, tab, and vertical-tab}. + +\subsubsection{Comment lines and blank lines} + +Both comment lines and blank lines provide no~\textbf{feature} data. + +Comment lines start with~\texttt{\#} with no whitespace beforehand. +A~\texttt{\#} appearing anywhere else in a line is treated as~\textbf{feature} data, not a comment. + +Blank lines consist entirely of whitespace. +Both comment and blank lines may appear as any line in a~\textbf{file}, at the beginning, middle, or end of the file. +They may appear in any quantity. + +\subsection{\acs{BED} fields} + +Each~\textbf{data line} contains between~3 and 12~whitespace-delimited~\textbf{field}s. +The first 3~\textbf{field}s are mandatory, and the last 9~\textbf{field}s are optional. +In optional~\textbf{field}s, the order is binding---if 1~\textbf{field} is filled, then all previous~\textbf{field}s must also be filled. +However, \textbf{BED10} and \textbf{BED11} are prohibited. + +In a \ac{BED}~\textbf{file}, each~\textbf{data line} must have the same number of~\textbf{field}s. +The positions in \ac{BED}~\textbf{field}s are all described in the~\textbf{0-start, half-open coordinate system}. 
+ +\begin{adjustwidth}{-0.5in}{-0.5in} + \noindent + \begin{tabularx}{\linewidth}{r l l l L} + \toprule + Col & Field & Type & Regex or range & Brief description \\ + \midrule + 1 & \textsf{chrom} & String & \texttt{[[:alnum:]\_]\{1,255\}}{\footnotemark} & \textbf{Chromosome} name \\ + 2 & \textsf{chromStart} & Int & $[0, 2^{64}-1]$ & \textbf{Feature} start position \\ + 3 & \textsf{chromEnd} & Int & $[0, 2^{64} -1]$ & \textbf{Feature} end position \\ + 4 & \textsf{name} & String & \texttt{[{\textasciicircum}{\textbackslash}t]\{0,255\}} & \textbf{Feature} description \\ + 5 & \textsf{score} & Int & $[0, 1000]$ & A numerical value \\ + 6 & \textsf{strand} & String & \texttt{[-+.]} & \textbf{Feature} strand \\ + 7 & \textsf{thickStart} & Int & $[0, 2^{64}-1]$ & Thick start position \\ + 8 & \textsf{thickEnd} & Int & $[0, 2^{64}-1]$ & Thick end position \\ + 9 & \textsf{itemRgb} & Int,Int,Int & \texttt{(}$[0, 255], [0,255], [0,255]$\texttt{) | 0} & Display color \\ % chktex 9 + 10 & \textsf{blockCount} & Int & $[0, \textsf{chromEnd}-\textsf{chromStart}]${\footnotemark} & Number of \textbf{block}s \\ + 11 & \textsf{blockSizes} & List[Int] & \texttt{([[:digit:]]+,)\{\textsf{blockCount}$-1$\}[[:digit:]]+,?}{\footnotemark} & \textbf{Block} sizes \\ + 12 & \textsf{blockStarts} & List[Int] & \texttt{([[:digit:]]+,)\{\textsf{blockCount}$-1$\}[[:digit:]]+,?} & \textbf{Block} start positions \\ + \bottomrule + \end{tabularx}\label{sec:table} + \footnotetext[5]{\texttt{[[:alnum:]\_]} is equivalent to the \ac{regex} \texttt{[A-Za-z0-9\_]}. 
% chktex 8 + It is also equivalent to the Perl extension \texttt{[[:word:]]}.} + \footnotetext[6]{\textsf{chromEnd}-\textsf{chromStart} is the maximum number of~\textbf{block}s that may exist without overlaps.} + \footnotetext{For example, if~$\textsf{blockCount} = 4$, then the allowed \ac{regex} would be~\texttt{([[:digit:]]+,)\{3\}[[:digit:]]+,?}} +\end{adjustwidth} + +\subsection{Coordinates} +\begin{enumerate} +\item \textsf{chrom}: The name of the~\textbf{chromosome} or scaffold where the~\textbf{feature} is present. + Limiting names to word characters only, instead of all non-whitespace characters, makes \ac{BED}~\textbf{file}s more portable to varying environments which may make different assumptions about allowed characters. + The name must be between~1 and 255~characters long, inclusive. + +\item \textsf{chromStart}: Start position of the~\textbf{feature} on the~\textbf{chromosome} or scaffold. + \textsf{chromStart}~must be an integer greater than or equal to~0 and less than the total number of bases of the~\textbf{chromosome} to which it belongs. + If the size of the~\textbf{chromosome} is unknown, then \textsf{chromStart}~must be less than or equal to~$2^{64} - 1$, which is the maximum size of an unsigned 64-bit integer. + +\item \textsf{chromEnd}: End position of the~\textbf{feature} on the~\textbf{chromosome} or scaffold. + \textsf{chromEnd}~must be an integer greater than or equal to the value of~\textsf{chromStart} and less than or equal to the total number of bases in the~\textbf{chromosome} to which it belongs. + If the size of the~\textbf{chromosome} is unknown, then \textsf{chromEnd}~must be less than or equal to~$2^{64} - 1$, the maximum size of an unsigned 64-bit integer. +\end{enumerate} + +\subsection{Simple attributes} +\begin{enumerate} + \setcounter{enumi}{3} + +\item \textsf{name}: String that describes the~\textbf{feature}. + The name must be~0 to 255~non-tab characters. 
+ The name must not be empty or contain whitespace, unless all fields in file are delimited exclusively using single tab characters. + A visual representation of the \ac{BED} format may display the name next to the~\textbf{feature}. + +\item \textsf{score}: Integer between~0 and~1000, inclusive. + If the~\textbf{feature} has no score information, then~\texttt{0} should be used as a default value. + A visual representation of the \ac{BED} format may shade features differently depending on their score. + +\item \textsf{strand}: Strand that the~\textbf{feature} appears on. + The strand may either refer to the~\texttt{+}~(sense or coding) strand or the~\texttt{-}~(antisense or complementary) strand. + If the~\textbf{feature} has no strand information or unknown strand, then a dot~(\texttt{.}) must be used. +\end{enumerate} + +\subsection{Display attributes} +\begin{enumerate} + \setcounter{enumi}{6} + +\item \textsf{thickStart}: Start position at which the~\textbf{feature} is visualized with a thicker or accented display. + This value must be an integer between~\textsf{chromStart} and~\textsf{chromEnd}, inclusive. + There is no specified default value for~\textsf{thickStart}. + +\item \textsf{thickEnd}: End position at which the~\textbf{feature} is visualized with a thicker or accented display. + This value must be an integer greater than or equal to~\textsf{thickStart} and less than or equal to~\textsf{chromEnd}, inclusive. + In \ac{BED} files with fewer than 7~\textbf{field}s, the whole~\textbf{feature} has thick display. + In \textbf{BED7+}~files, to achieve the same effect, set \textsf{thickStart}~equal to~\textsf{chromStart} and \textsf{thickEnd}~equal to~\textsf{chromEnd}. + If this~\textbf{field} is not specified but \textsf{thickStart}~is, then the entire~\textbf{feature} has thick display. + There is no specified default value for~\textsf{thickEnd}. 
+ +\item \textsf{itemRgb}: A triple of integers that determines the color of this~\textbf{feature} when visualized. + The triple is three integers separated by commas. + Each integer is between~0 and~255, inclusive. + To make a~\textbf{feature} black, \textsf{itemRgb}~may be a single~\texttt{0}, which is visualized identically to a~\textbf{feature} with \textsf{itemRgb} of \texttt{0,0,0}. +\end{enumerate} + +\subsection{Blocks} +\begin{enumerate} + \setcounter{enumi}{9} + +\item \textsf{blockCount}: Number of~\textbf{block}s in the~\textbf{feature}. + \textsf{blockCount}~must be an integer greater than 0. + \textsf{blockCount}~is mandatory in~\textbf{BED12+}~files. + Null or empty~\textsf{blockCount} are not allowed, because~\textsf{blockSizes} and~\textsf{blockStarts} rely on~\textsf{blockCount}. + A visual representation of the \ac{BED} format may have blocks appear thicker than the rest of the~\textbf{feature}. + +\item \textsf{blockSizes}: Comma-separated list of length~\textsf{blockCount} containing the size of each~\textbf{block}. + There must be no spaces before or after commas. + There may be a trailing comma after the last element of the list. + \textsf{blockSizes}~is mandatory in \textbf{BED12+} files. + Null or empty~\textsf{blockSizes} is not allowed, because \textsf{blockStarts}~cannot be verified without~\textsf{blockSizes}. + +\item \textsf{blockStarts}: Comma-separated list of length~\textsf{blockCount} containing each \textbf{block}'s~start position, relative to~\textsf{chromStart}. + There must not be spaces before or after the commas. + There may be a trailing comma after the last element of the list. + Each element in~\textsf{blockStarts} is paired with the corresponding element in~\textsf{blockSizes}. + Each \textsf{blockStarts}~element must be an integer between~0 and~$\textsf{chromEnd} - \textsf{chromStart}$, inclusive. 
+ For each couple~$i$ of~$(\textsf{blockStarts}_i, \textsf{blockSizes}_i)$, the quantity~$\textsf{chromStart} + \textsf{blockStarts}_i + \textsf{blockSizes}_i$ must be less than or equal to \textsf{chromEnd}. + These conditions enforce that each~\textbf{block} is contained within the~\textbf{feature}. + The first~\textbf{block} must start at~\textsf{chromStart} and the last~\textbf{block} must end at~\textsf{chromEnd}. + Moreover, the~\textbf{block}s must not overlap. + The list must be sorted in ascending order. + \textsf{blockStarts}~is mandatory in~\textbf{BED12+} files. + Null or empty~\textsf{blockStarts} is not allowed. +\end{enumerate} + +\section{Examples} + +\subsection[title]{Example BED6 file from the \acs{UCSC} Genome Browser FAQ\footnote{``Frequently + Asked Questions: Data File Formats.'' \ac{UCSC} Genome Browser FAQ, + \url{https://genome.ucsc.edu/FAQ/FAQformat.html}}}\label{sec:example-bed6} + +\begin{verbatim} +chr7 127471196 127472363 Pos1 0 + +chr7 127472363 127473530 Pos2 0 + +chr7 127473530 127474697 Pos3 0 + +chr7 127474697 127475864 Pos4 0 + +chr7 127475864 127477031 Neg1 0 - +chr7 127477031 127478198 Neg2 0 - +chr7 127478198 127479365 Neg3 0 - +chr7 127479365 127480532 Pos5 0 + +chr7 127480532 127481699 Neg4 0 - +\end{verbatim} + +\subsection{Example BED12 file from the \acs{UCSC} Genome Browser FAQ} +\begin{verbatim} +chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488, 0,3512 +chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399, 0,3601 +\end{verbatim} + +The~\textbf{block}s in this example satisfy the required constraints. +The first~\textbf{block} starts at~\textsf{chromStart} since the first~\textsf{blockStarts} element is~0. +The last~\textbf{block} ends at~\textsf{chromEnd} since the last~\textbf{block} starts at position 4512~(1000+3512) with size~488, and therefore ends at position 5000~(4512+488). 
+ +\section{Recommended practice for the \acs{BED} format} + +\subsection{Mandatory fields} +\begin{itemize} +\item \textsf{chrom}: The name of each~\textbf{chromosome} should also match the names from a reference genome, if applicable. + For example, in the human genome, the chromosomes may be named~\texttt{chr1} to \texttt{chr22}, \texttt{chrX}, \texttt{chrY}, and~\texttt{chrM}. + Names should be consistent within a~\textbf{file}. + For example, one should not use both~\texttt{17} and~\texttt{chr17} to represent the same~\textbf{chromosome} in the same~\textbf{file}. +\end{itemize} + +\subsection{Optional fields}\label{sec:optional} +\begin{itemize} +\item \textsf{name}: If a feature has no name, then a dot~(\texttt{.}) should be used. + Names should avoid using the space character even if the file is exclusively delimited with single tab characters because parsers may interpret a space as a delimiter. + +\item \textsf{itemRgb}: Eight or fewer colors should be used as too many colors may slow down visualizations and are difficult for humans to distinguish.\footnote{``Frequently + Asked Questions: Data File Formats.'' \ac{UCSC} Genome Browser FAQ, + \url{https://genome.ucsc.edu/FAQ/FAQformat.html}} + +\end{itemize} + +\subsection{User-defined fields} + +Custom data \textbf{fields} may contain any non-tab 7-bit US \ac{ASCII} character. +Definitions of a custom \ac{BED} format should restrict the type of each \textbf{field} to the extent possible. 
+Each custom \textbf{field} should contain either one of the following data types or a comma-separated list of values of the same type: + +\noindent +\begin{tabularx}{\textwidth}{r L} + \toprule + Type & Definition \\ + \midrule + Integer & String representation of 64-bit signed integer\footnote{\emph{IEEE 754--1985 IEEE Standard for Binary Floating-Point Arithmetic.} IEEE 754--1985, 1985} \\ + Unsigned & String representation of 64-bit unsigned integer\footnotemark[10] \\ + Float & String representation of 64-bit floating point number\footnotemark[10] \\ + Character & One character, other than tab \\ + String & One or more characters, other than tab \\ + \bottomrule +\end{tabularx} + +This specification does not contain a means for interchanging custom \ac{BED} format definitions. +The AutoSQL format\footnote{Kent, W.~James. + (2000) ``AutoSQL.'' + \url{https://hgwdev.gi.ucsc.edu/~kent/exe/doc/autoSql.doc}} provides one method for defining custom \ac{BED} formats in a separate file. + +\subsection{Sorting} +\Ac{BED} \textbf{file}s should be sorted by~\textsf{chrom}, then by~\textsf{chromStart} numerically, and finally by~\textsf{chromEnd} numerically. +\textsf{chrom} may be sorted using any scheme (such as lexicographic or numeric order), but all lines with the same~\textsf{chrom} value should occur consecutively. +For example, the lexicographic order of~\texttt{chr1}, \texttt{chr10}, \texttt{chr11}, \texttt{chr12}, {\ldots}, \texttt{chr2}, \texttt{chr20}, \texttt{chr21}, {\ldots}, \texttt{chr3}, {\ldots}, \texttt{chrX}, \texttt{chrY}, \texttt{chrM} is an acceptable sorting. +The numeric order of~\texttt{chr1}, \texttt{chr2}, {\ldots}, \texttt{chr21}, \texttt{chr22}, \texttt{chrM}, \texttt{chrX}, \texttt{chrY} is also acceptable. +Regardless of the chromosome sorting scheme, lines for two features on the same chromosome should not have any lines for features on other chromosomes between them. 
+ +\subsection{Whitespace}\label{sec:whitespace} +Though lines may use any kind of whitespace as a delimiter between~\textbf{field}s, a single tab~(\texttt{{\textbackslash}t}) should be used. +This is because almost all tools support tabs while some tools do not support other kinds of whitespace. +Also, whitespace within the~\textsf{name}~\textbf{field} may be used only if the \textbf{field}~delimiter is tab throughout the \textbf{file}. + +\subsection{Large \acs{BED} files} +If a~\textbf{file} intended for visualization is over \SI{50}{\mebi\byte} in size, the~\textbf{file} should be converted to~\texttt{bigBed} format, which is an indexed binary format.\footnote{Kent, W.~James et al. + (2010) ``BigWig and BigBed: enabling browsing of large distributed datasets.'' + \emph{Bioinformatics} 26(17):2204--2207. + \url{https://doi.org/10.1093/bioinformatics/btq351}} +The~\texttt{bedToBigBed} program may perform this conversion.\footnote{``bigBed Track Format.'' + \Ac{UCSC} Genome Browser FAQ, \url{https://genome.ucsc.edu/goldenPath/help/bigBed.html}} + +\section{\acs{UCSC} track files} + +Track files are files that contain additional information intended for a visualization tool such as the \ac{UCSC} Genome Browser.\footnote{Haeussler, Maximilian et al. + (2019) ``The \acl{UCSC} Genome Browser database: 2019 update.'' + \emph{Nucleic Acids Research} 47(D1):D853--D858. + \url{https://doi.org/10.1093/nar/gky1095}} +Track files contain browser lines and track lines that precede lines from a file format supported by the Genome Browser.\footnote{``Displaying your own annotations in the Genome Browser.'' \ac{UCSC} Genome Browser FAQ, \url{https://genome.ucsc.edu/goldenPath/help/customTrack.html\#lines}} +Track files are not valid \ac{BED} files --- valid \ac{BED} files must not have any browser or track lines. +To distinguish between \ac{BED} files and track files, track files should use the file extension~\texttt{.track}. 
+ +\section{Acronyms} +\begin{acronym}[ASCII] + \acro{ASCII}{American Standard Code for Information Interchange} + \acro{BED}{Browser Extensible Data} + \acro{GA4GH}{Global Alliance for Genomics and Health} + \acro{regex}{regular expression} + \acro{UCSC}{University of California, Santa Cruz} +\end{acronym} + +\section{Acknowledgments} + +We thank W.~James Kent and the \ac{UCSC} Genome Browser team for creating the \ac{BED} format. +We thank W.~James Kent and Hiram Clawson~(\ac{UCSC}); Eric Roberts~(Princess Margaret Cancer Centre); John Marshall~(University of Glasgow); Aaron R.~Quinlan and Brent S.~Pedersen~(University of Utah); Ting Wang~(Washington University in St.~Louis); and the \ac{GA4GH} File Formats Task Team for comments on this specification. + +\end{document} + +% chktex-file 17 + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: t +%%% End: From 780b8a1cfd233015f65a74f73d6c78fb43b02548 Mon Sep 17 00:00:00 2001 From: Michael Hoffman Date: Wed, 19 May 2021 15:59:49 -0400 Subject: [PATCH 2/9] Integrate into existing hts-specs infrastructure Add BEDv1.pdf to Makefile. Departures from previous practice in this repository: - [overrides LATEXMK_ENGINE] I developed the document with `lualatex` (included in TeX Live) as I usually do instead of `pdflatex`, because it fixes some warts especially in font selection and is where I understand TeX engine development has been focused for years. If necessary, I could change the font setup to use `pdflatex` instead. 
add BED to `MAINTAINERS.md` add version details from BEDv1.ver rather than using gitinfo2 add fallback `\providecommand` for `\Ac` hack package `acronym` hyperlink using option `nohyperlinks` instead make acronym list single-spaced and align to longest acronym --- BEDv1.tex | 32 +++++++++++++++++++++----------- MAINTAINERS.md | 5 +++++ Makefile | 4 ++++ 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/BEDv1.tex b/BEDv1.tex index 91f2bf82f..90c66fc7b 100644 --- a/BEDv1.tex +++ b/BEDv1.tex @@ -3,9 +3,9 @@ \usepackage[letterpaper,margin=1in]{geometry} -\usepackage{acronym} \usepackage{amsmath} \usepackage{booktabs} +\usepackage{calc} \usepackage[flushmargin,hang]{footmisc} \usepackage{microtype} \usepackage{newverbs} @@ -13,22 +13,25 @@ \usepackage{tabularx} \usepackage{todonotes} \usepackage[hyperfootnotes=false]{hyperref} % doesn't work in tabulars as currently set +\usepackage[nohyperlinks]{acronym} \usepackage{footnotehyper} \usepackage[strict]{changepage} \usepackage[binary-units=true]{siunitx} -\usepackage[mark]{gitinfo2} +\usepackage{enumitem} + +\input{BEDv1.ver} \hypersetup{colorlinks=true, linkcolor=blue, filecolor=magenta, urlcolor=blue, - pdfinfo={githash=\gitHash}} + pdfinfo={githash=\commitdesc}} \definecolor{cverbbg}{gray}{0.93} \title{The Browser Extensible Data~(BED) format} \author{Jeffrey Niu, Danielle Denisko, Michael M.~Hoffman} -\date{\today} +\date{\headdate} \setlength{\emergencystretch}{\hsize} \setlength{\footnotemargin}{1em} @@ -38,19 +41,23 @@ \newcolumntype{L}{>{\raggedright\arraybackslash}X} +\providecommand*{\Ac}[1]{\ac{#1}} % work around outdated acronym.sty packages + +\frenchspacing + % eliminate passive voice warnings % chktex-file -3 \begin{document} -\makeatletter -\renewcommand*{\AC@hyperlink}[2]{#2} % do not hyperlink acronyms -\makeatother - - -\frenchspacing \maketitle +\begin{small} +\noindent +The master version of this document can be found at \url{https://github.com/samtools/hts-specs}. 
+This printing is version~\commitdesc\ from that repository, last modified on the date shown above. +\end{small} + \acused{ASCII} \section{Specification} @@ -370,7 +377,10 @@ \section{\acs{UCSC} track files} To distinguish between \ac{BED} files and track files, track files should use the file extension~\texttt{.track}. \section{Acronyms} -\begin{acronym}[ASCII] + +% using the optional argument to acronym to set the label width causes it to use the list environment instead of description, which means we can't set nosep easily +\setlist[description]{labelwidth=\widthof{\textbf{GA4GH}},nosep} +\begin{acronym} \acro{ASCII}{American Standard Code for Information Interchange} \acro{BED}{Browser Extensible Data} \acro{GA4GH}{Global Alliance for Genomics and Health} diff --git a/MAINTAINERS.md b/MAINTAINERS.md index f447bc4a9..aaea10bb4 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -47,6 +47,11 @@ Past refget maintainers include Matt Laird. * Alexander Senf (@AlexanderSenf) * Robert Davies (@daviesrob) +### BED + +* Michael Hoffman (@michaelmhoffman) +* Aaron Quinlan (@arq5x) + [ga4gh-ff]: https://www.ga4gh.org/howwework/workstreams/#lsg diff --git a/Makefile b/Makefile index 30cb1139f..94b61bf52 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ all: pdf PDFS = BCFv1_qref.pdf \ BCFv2_qref.pdf \ + BEDv1.pdf \ CRAMv2.1.pdf \ CRAMv3.pdf \ crypt4gh.pdf \ @@ -19,6 +20,7 @@ pdf: $(PDFS:%=new/%) %.pdf: new/%.pdf cp $^ $@ +new/BEDv1.pdf diff/BEDv1.pdf: BEDv1.tex new/BEDv1.ver new/CRAMv2.1.pdf diff/CRAMv2.1.pdf: CRAMv2.1.tex new/CRAMv2.1.ver new/CRAMv3.pdf diff/CRAMv3.pdf: CRAMv3.tex new/CRAMv3.ver new/crypt4gh.pdf diff/crypt4gh.pdf: crypt4gh.tex new/crypt4gh.ver @@ -40,6 +42,8 @@ LATEXMK_FLAGS = new/%.pdf: %.tex $(LATEXMK) --output-directory=new $< +new/BEDv1.pdf: LATEXMK_ENGINE = --lualatex + new/CRAMv2.1.ver new/CRAMv3.ver: img/CRAMFileFormat2-1-fig001.png img/CRAMFileFormat2-1-fig002.png img/CRAMFileFormat2-1-fig003.png img/CRAMFileFormat2-1-fig004.png 
img/CRAMFileFormat2-1-fig005.png img/CRAMFileFormat2-1-fig006.png img/CRAMFileFormat2-1-fig007.png new/VCFv4.1.ver new/VCFv4.2.ver new/VCFv4.3.ver new/VCFv4.4.draft.ver:: img/all_orientations-400x296.png img/derivation-400x267.png img/erosion-400x211.png img/inserted_contig-400x247.png img/inserted_sequence-400x189.png img/inversion-400x95.png img/microhomology-400x248.png img/multiple_mates-400x280.png img/phasing-400x259.png img/reciprocal_rearrangement-400x192.png img/telomere-400x251.png From 70c88c33f0980bd37f0f5c460d9ff8573af9548a Mon Sep 17 00:00:00 2001 From: Jeffrey Niu Date: Mon, 28 Jun 2021 10:46:22 -0400 Subject: [PATCH 3/9] Edits in response to public comments 2021-06-09 through 2021-06-28 Addresses public comments received on samtools/hts-specs#570: - [x] change the field separator to not include newline or carriage return ([thanks](#issuecomment-857452438) @simonbrent) - [x] "line feed" -> "newline" ([thanks](#issuecomment-857452438) @simonbrent) - [x] "carriage-return" -> "carriage return" ([thanks](#issuecomment-857452438) @simonbrent) - [x] define "newline" on first use as `'\n'` ([thanks](#issuecomment-857452438) @simonbrent) - [x] clarify that when `chromEnd` is equal to `chromStart` represents a zero-length feature, which is a feature between two bases such as an insertion. 
`chromStart=0`, `chromEnd=0` represents an insertion before the first nucleotide of a chromosome ([thanks](#issuecomment-857460372) @simonbrent) - [x] specify that fields, including custom fields, can only be empty when a single tab is used as the delimiter ([thanks](#issuecomment-857475922) @pdl) - [x] specify that field data is ASCII printable characters only---the range `'\x20'` to `'\x7e'` ([thanks](#issuecomment-857475922) @pdl) - [x] `name`: change regex so that it cannot be empty ([thanks](#issuecomment-857475922) @pdl) - [x] `score`: clarify that `0` should be used in BED5+ files where a `score` attribute of features would be uninformative ([thanks](#issuecomment-857475922) @pdl) - [x] `strand`: - [x] clarify that `.` is the default value, and that a parser should treat BED5 files as if they have `strand=.` - [x] explicitly specify that it cannot be empty in a BED6+ file ([thanks](#issuecomment-857475922) @pdl) - [x] `thickEnd`: clarify that the field is not specified but `thickStart` is means BED files that are not BED8+ ([thanks](#issuecomment-857475922) @pdl) - [x] change "null or empty" to only "empty" ([thanks](#issuecomment-857475922) @pdl) - [x] add recommendation to use colorblind-friendly color schemes, and especially to avoid red-green color schemes ([thanks](#issuecomment-857788414) @JspSrs) - [x] sorting: - [x] add that arbitrary orderings of `chrom` are allowed as long as all lines with the same `chrom` value occur consecutively ([thanks](#issuecomment-857955521) @ZhenyuZ) - [x] specify that multiple features with the same `chrom`, `chromStart`, and `chromEnd` may appear in any order - [x] add a section, before "UCSC track files", discussing information that is supplied out-of-band. This should include - [x] which of the first 4-12 fields are standard BED fields and which are custom fields ([thanks](#issuecomment-857475922) @pdl) - [x] genome assembly used - [x] semantics of `score`, `itemRgb`, thick vs. thin positions, block vs. 
non-block positions - [x] definitions of custom fields - [x] whether tab is the only delimiter between fields ([thanks](#issuecomment-857475922) @pdl, [thanks](#issuecomment-857508028) @andrewyatz) - [x] add acknowledgments to any of the folks thanked in the above checklist, with full name and affiliation Additionally, define "field separator", remove form feed and vertical tab from valid field separators, specify "data line" instead of "line" in several places, and correct some places where boldface was not used but should be. Co-authored-by: Michael Hoffman --- BEDv1.tex | 126 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 78 insertions(+), 48 deletions(-) diff --git a/BEDv1.tex b/BEDv1.tex index 90c66fc7b..21c84e1f6 100644 --- a/BEDv1.tex +++ b/BEDv1.tex @@ -42,6 +42,7 @@ \newcolumntype{L}{>{\raggedright\arraybackslash}X} \providecommand*{\Ac}[1]{\ac{#1}} % work around outdated acronym.sty packages +\newcommand*{\acrodefused}[2]{\acrodef{#1}{#2}\acused{#1}} \frenchspacing @@ -63,7 +64,7 @@ \section{Specification} \Ac{BED} is a whitespace-delimited file format, where each~\textbf{file} consists of one or more~\textbf{line}s.\footnote{``Frequently Asked Questions: Data File Formats.'' \ac{UCSC} Genome Browser FAQ, \url{https://genome.ucsc.edu/FAQ/FAQformat.html}} -Each~\textbf{line} describes discrete genomic~\textbf{feature}s by physical start and end position on a linear~\textbf{chromosome}. +Data are in~\textbf{data line}s, which describe discrete genomic~\textbf{feature}s by physical start and end position on a linear~\textbf{chromosome}. The file extension for the \ac{BED} format is~\texttt{.bed}. \subsection{Typographic conventions} @@ -98,7 +99,7 @@ \subsection{Terminology and concepts}\label{sec:terms} A~\textbf{file} that has $n$~\textbf{field}s of the \ac{BED} format, followed by any number of~\textbf{field}s of custom data defined by a user. 
\item[BED$n$+$m$:] - A~\textbf{file} that has a custom tab-delimited format starting with the first $n$~\textbf{field}s of the \ac{BED} format, followed by $m$~\textbf{field}s of custom data defined by a user. + A~\textbf{file} that has a custom format starting with the first $n$~\textbf{field}s of the \ac{BED} format, followed by $m$~\textbf{field}s of custom data defined by a user. For example, \textbf{BED6+4}~means a~\textbf{file} with the first 6~\textbf{field}s of the \ac{BED} format, followed by 4~user-defined~\textbf{field}s. \item[block:] @@ -108,10 +109,10 @@ \subsection{Terminology and concepts}\label{sec:terms} \item[chromosome:] A sequence of nucleobases with a name. In this specification, ``chromosome'' may also describe a named scaffold that does not fit the biological definition of a chromosome. - Often, chromosomes are numbered starting from~\texttt{1}. - There are also often sex chromosomes such as~\texttt{W}, \texttt{X}, \texttt{Y}, and~\texttt{Z}, mitochondrial chromosomes such as~\texttt{M}, and possibly scaffolds from an unknown chromosome, often labeled~\texttt{Un}. - The name of each chromosome is often prefixed with~\texttt{chr}. - Examples of chromosome names include~\texttt{chr1}, \texttt{21}, \texttt{chrX}, \texttt{chrM}, \texttt{chrUn}, \texttt{chr19\_KI270914v1\_alt}, and~\texttt{chrUn\_KI270435v1}. + Often, \textbf{chromosome}s are numbered starting from~\texttt{1}. + There are also often sex \textbf{chromosome}s such as~\texttt{W}, \texttt{X}, \texttt{Y}, and~\texttt{Z}, mitochondrial \textbf{chromosome}s such as~\texttt{M}, and possibly scaffolds from an unknown chromosome, often labeled~\texttt{Un}. + The name of each \textbf{chromosome} is often prefixed with~\texttt{chr}. + Examples of \textbf{chromosome} names include~\texttt{chr1}, \texttt{21}, \texttt{chrX}, \texttt{chrM}, \texttt{chrUn}, \texttt{chr19\_KI270914v1\_alt}, and~\texttt{chrUn\_KI270435v1}. 
\item[feature:] A linear region of a~\textbf{chromosome} with specified properties. @@ -119,7 +120,12 @@ \subsection{Terminology and concepts}\label{sec:terms} \item[field:] Data stored as non-tab text. - All~\textbf{field}s are 7-bit US \ac{ASCII}. + All~\textbf{field}s are 7-bit US \ac{ASCII} printable characters\footnote{Characters in the range `\texttt{{\textbackslash}x20}' to `\texttt{{\textbackslash}x7e}', therefore not including any control characters}. + Only some \textbf{field}s can be empty, and they can only be empty when a single tab is used as the \textbf{field separator}. + +\item[field separator:] + One or more horizontal whitespace characters (space or tab). + The \textbf{field} separator must match the \ac{regex}~\texttt{[ {\textbackslash}]+}. \item[file:] Sequence of one or more~\textbf{line}s. @@ -130,7 +136,7 @@ \subsection{Terminology and concepts}\label{sec:terms} Discussed more fully in~\autoref{sec:lines} \item[line separator:] - Either carriage return, line feed, or carriage return followed by line feed. + Either carriage return, newline, or carriage return followed by newline\footnote{A newline is defined as `\texttt{{\textbackslash}n}'} The same \textbf{line separator} must be used throughout the \textbf{file}. \end{description} @@ -138,24 +144,23 @@ \subsection{Lines}\label{sec:lines} \subsubsection{Data lines} -Data lines contain \textbf{feature}~information. -A data line is composed of~\textbf{field}s separated by whitespace. -The whitespace must match the \ac{regex}~\texttt{[[:space:]]+}\footnote{\texttt{[[:space:]]} includes the following characters: space, form-feed, newline, carriage-return, tab, and vertical-tab}. +\textbf{Data line}s contain \textbf{feature}~information. +A \textbf{data line} is composed of~\textbf{field}s separated by \textbf{field separator}s. \subsubsection{Comment lines and blank lines} -Both comment lines and blank lines provide no~\textbf{feature} data. 
+Both \textbf{comment line}s and \textbf{blank line}s provide no~\textbf{feature} data. -Comment lines start with~\texttt{\#} with no whitespace beforehand. -A~\texttt{\#} appearing anywhere else in a line is treated as~\textbf{feature} data, not a comment. +\textbf{Comment line}s start with~\texttt{\#} with no horizontal whitespace beforehand. +A~\texttt{\#} appearing anywhere else in a \textbf{data line} is treated as~\textbf{feature} data, not a comment. -Blank lines consist entirely of whitespace. -Both comment and blank lines may appear as any line in a~\textbf{file}, at the beginning, middle, or end of the file. +\textbf{Blank line}s consist entirely of horizontal whitespace. +Both comment and blank \textbf{line}s may appear as any \textbf{line} in a~\textbf{file}, at the beginning, middle, or end of the \textbf{file}. They may appear in any quantity. \subsection{\acs{BED} fields} -Each~\textbf{data line} contains between~3 and 12~whitespace-delimited~\textbf{field}s. +Each~\textbf{data line} contains between~3 and 12~~\textbf{field}s delimited by a \textbf{field separator}. The first 3~\textbf{field}s are mandatory, and the last 9~\textbf{field}s are optional. In optional~\textbf{field}s, the order is binding---if 1~\textbf{field} is filled, then all previous~\textbf{field}s must also be filled. However, \textbf{BED10} and \textbf{BED11} are prohibited. 
@@ -172,7 +177,7 @@ \subsection{\acs{BED} fields} 1 & \textsf{chrom} & String & \texttt{[[:alnum:]\_]\{1,255\}}{\footnotemark} & \textbf{Chromosome} name \\ 2 & \textsf{chromStart} & Int & $[0, 2^{64}-1]$ & \textbf{Feature} start position \\ 3 & \textsf{chromEnd} & Int & $[0, 2^{64} -1]$ & \textbf{Feature} end position \\ - 4 & \textsf{name} & String & \texttt{[{\textasciicircum}{\textbackslash}t]\{0,255\}} & \textbf{Feature} description \\ + 4 & \textsf{name} & String & \texttt{[{\textasciicircum}{\textbackslash}t]\{1,255\}} & \textbf{Feature} description \\ 5 & \textsf{score} & Int & $[0, 1000]$ & A numerical value \\ 6 & \textsf{strand} & String & \texttt{[-+.]} & \textbf{Feature} strand \\ 7 & \textsf{thickStart} & Int & $[0, 2^{64}-1]$ & Thick start position \\ @@ -191,16 +196,18 @@ \subsection{\acs{BED} fields} \subsection{Coordinates} \begin{enumerate} -\item \textsf{chrom}: The name of the~\textbf{chromosome} or scaffold where the~\textbf{feature} is present. +\item \textsf{chrom}: The name of the~\textbf{chromosome} where the~\textbf{feature} is present. Limiting only to word characters only, instead of all non-whitespace characters, makes \ac{BED}~\textbf{file}s more portable to varying environments which may make different assumptions about allowed characters. The name must be between~1 and 255~characters long, inclusive. -\item \textsf{chromStart}: Start position of the~\textbf{feature} on the~\textbf{chromosome} or scaffold. +\item \textsf{chromStart}: Start position of the~\textbf{feature} on the~\textbf{chromosome}. \textsf{chromStart}~must be an integer greater than or equal to~0 and less than the total number of bases of the~\textbf{chromosome} to which it belongs. If the size of the~\textbf{chromosome} is unknown, then \textsf{chromStart}~must be less than or equal to~$2^{64} - 1$, which is the maximum size of an unsigned 64-bit integer. -\item \textsf{chromEnd}: End position of the~\textbf{feature} on the~\textbf{chromosome} or scaffold. 
+\item \textsf{chromEnd}: End position of the~\textbf{feature} on the~\textbf{chromosome}. \textsf{chromEnd}~must be an integer greater than or equal to the value of~\textsf{chromStart} and less than or equal to the total number of bases in the~\textbf{chromosome} to which it belongs. + If \textsf{chromEnd}~is equal to~\textsf{chromStart}, this indicates a \textbf{feature} between \textsf{chromStart} and the preceding base, such as an insertion. + When \textsf{chromStart} and \textsf{chromEnd} are both~0, this indicates a feature before the entire~\textbf{chromosome}. If the size of the~\textbf{chromosome} is unknown, then \textsf{chromEnd}~must be less than or equal to~$2^{64} - 1$, the maximum size of an unsigned 64-bit integer. \end{enumerate} @@ -209,17 +216,19 @@ \subsection{Simple attributes} \setcounter{enumi}{3} \item \textsf{name}: String that describes the~\textbf{feature}. - The name must be~0 to 255~non-tab characters. - The name must not be empty or contain whitespace, unless all fields in file are delimited exclusively using single tab characters. - A visual representation of the \ac{BED} format may display the name next to the~\textbf{feature}. + \textsf{name} must be~1 to 255~non-tab characters. + \textsf{name} must not be empty or contain whitespace, unless the only \textbf{field separator} is a single tab. + A visual representation of the \ac{BED} format may display \textsf{name} next to the~\textbf{feature}. \item \textsf{score}: Integer between~0 and~1000, inclusive. - If the~\textbf{feature} has no score information, then~\texttt{0} should be used as a default value. - A visual representation of the \ac{BED} format may shade features differently depending on their score. + In~\textbf{BED6+} \textbf{file}s where all \textbf{feature}s have uninformative \textsf{score}s, \texttt{0} should be used as the \textsf{score} on every \textbf{data line}. 
+ A visual representation of the \ac{BED} format may shade \textbf{feature}s differently depending on their \textsf{score}. \item \textsf{strand}: Strand that the~\textbf{feature} appears on. - The strand may either refer to the~\texttt{+}~(sense or coding) strand or the~\texttt{-}~(antisense or complementary) strand. - If the~\textbf{feature} has no strand information or unknown strand, then a dot~(\texttt{.}) must be used. + The \textsf{strand} may either refer to the~\texttt{+}~(sense or coding) strand or the~\texttt{-}~(antisense or complementary) strand. + If the~\textbf{feature} has no \textsf{strand} information or unknown \textsf{strand}, then a dot~(\texttt{.}) must be used as a default value. + \textsf{strand} cannot be empty in \textbf{BED6+} \textbf{file}s. + A parser should treat \textbf{files} that are not \textbf{BED6+} as if \textsf{strand} were \texttt{.}. \end{enumerate} \subsection{Display attributes} @@ -232,10 +241,10 @@ \subsection{Display attributes} \item \textsf{thickEnd}: End position at which the~\textbf{feature} is visualized with a thicker or accented display. This value must be an integer greater than or equal to~\textsf{thickStart} and less than or equal to~\textsf{chromEnd}, inclusive. - In \ac{BED} files with fewer than 7~\textbf{field}s, the whole~\textbf{feature} has thick display. - In \textbf{BED7+}~files, to achieve the same effect, set \textsf{thickStart}~equal to~\textsf{chromStart} and \textsf{thickEnd}~equal to~\textsf{chromEnd}. + In \ac{BED} \textbf{file}s with fewer than 7~\textbf{field}s, the whole~\textbf{feature} has thick display. + In \textbf{BED7+}~\textbf{file}s, to achieve the same effect, set \textsf{thickStart}~equal to~\textsf{chromStart} and \textsf{thickEnd}~equal to~\textsf{chromEnd}. If this~\textbf{field} is not specified but \textsf{thickStart}~is, then the entire~\textbf{feature} has thick display. - There is no specified default value for~\textsf{thickEnd}.
+ For \textbf{BED7+} \textbf{file}s that are not \textbf{BED8+}, there is no specified default value for~\textsf{thickEnd}. \item \textsf{itemRgb}: A triple of integers that determines the color of this~\textbf{feature} when visualized. The triple is three integers separated by commas. @@ -249,15 +258,15 @@ \subsection{Blocks} \item \textsf{blockCount}: Number of~\textbf{block}s in the~\textbf{feature}. \textsf{blockCount}~must be an integer greater than 0. - \textsf{blockCount}~is mandatory in~\textbf{BED12+}~files. - Null or empty~\textsf{blockCount} are not allowed, because~\textsf{blockSizes} and~\textsf{blockStarts} rely on~\textsf{blockCount}. + \textsf{blockCount}~is mandatory in~\textbf{BED12+}~\textbf{file}s. + Empty~\textsf{blockCount} are not allowed, because~\textsf{blockSizes} and~\textsf{blockStarts} rely on~\textsf{blockCount}. A visual representation of the \ac{BED} format may have blocks appear thicker than the rest of the~\textbf{feature}. \item \textsf{blockSizes}: Comma-separated list of length~\textsf{blockCount} containing the size of each~\textbf{block}. There must be no spaces before or after commas. There may be a trailing comma after the last element of the list. - \textsf{blockSizes}~is mandatory in \textbf{BED12+} files. - Null or empty~\textsf{blockSizes} is not allowed, because \textsf{blockStarts}~cannot be verified without~\textsf{blockSizes}. + \textsf{blockSizes}~is mandatory in \textbf{BED12+} \textbf{file}s. + Empty~\textsf{blockSizes} is not allowed, because \textsf{blockStarts}~cannot be verified without~\textsf{blockSizes}. \item \textsf{blockStarts}: Comma-separated list of length~\textsf{blockCount} containing each \textbf{block}'s~start position, relative to~\textsf{chromStart}. There must not be spaces before or after the commas. @@ -269,8 +278,8 @@ \subsection{Blocks} The first~\textbf{block} must start at~\textsf{chromStart} and the last~\textbf{block} must end at~\textsf{chromEnd}. 
Moreover, the~\textbf{block}s must not overlap. The list must be sorted in ascending order. - \textsf{blockStarts}~is mandatory in~\textbf{BED12+} files. - Null or empty~\textsf{blockStarts} is not allowed. + \textsf{blockStarts}~is mandatory in~\textbf{BED12+} \textbf{file}s. + Empty~\textsf{blockStarts} is not allowed. \end{enumerate} \section{Examples} @@ -306,25 +315,28 @@ \section{Recommended practice for the \acs{BED} format} \subsection{Mandatory fields} \begin{itemize} \item \textsf{chrom}: The name of each~\textbf{chromosome} should also match the names from a reference genome, if applicable. - For example, in the human genome, the chromosomes may be named~\texttt{chr1} to \texttt{chr22}, \texttt{chrX}, \texttt{chrY}, and~\texttt{chrM}. + For example, in the human genome, the \textbf{chromosome}s may be named~\texttt{chr1} to \texttt{chr22}, \texttt{chrX}, \texttt{chrY}, and~\texttt{chrM}. Names should be consistent within a~\textbf{file}. For example, one should not use both~\texttt{17} and~\texttt{chr17} to represent the same~\textbf{chromosome} in the same~\textbf{file}. \end{itemize} \subsection{Optional fields}\label{sec:optional} \begin{itemize} -\item \textsf{name}: If a feature has no name, then a dot~(\texttt{.}) should be used. - Names should avoid using the space character even if the file is exclusively delimited with single tab characters because parsers may interpret a space as a delimiter. +\item \textsf{name}: If a \textbf{feature} has no name, then a dot~(\texttt{.}) should be used. + Names should avoid using the space character even if the only \textbf{field separator} is a single tab character, because parsers may interpret a space as a \textbf{field separator}. 
\item \textsf{itemRgb}: Eight or fewer colors should be used as too many colors may slow down visualizations and are difficult for humans to distinguish.\footnote{``Frequently Asked Questions: Data File Formats.'' \ac{UCSC} Genome Browser FAQ, \url{https://genome.ucsc.edu/FAQ/FAQformat.html}} + Color schemes should be colorblind-friendly. + Red-green color schemes should be avoided. \end{itemize} \subsection{User-defined fields} -Custom data \textbf{fields} may contain any non-tab 7-bit US \ac{ASCII} character. +Custom data \textbf{fields} may contain any printable 7-bit US \ac{ASCII} character (which includes spaces, but excludes tabs, newlines, and other control characters). +Custom data \textbf{fields} can only be empty when a single tab is used as the \textbf{field separator} throughout the \textbf{file}. Definitions of a custom \ac{BED} format should restrict the type of each \textbf{field} to the extent possible. Each custom \textbf{field} should contain either one of the following data types or a comma-separated list of values of the same type: @@ -348,15 +360,16 @@ \subsection{User-defined fields} \subsection{Sorting} \Ac{BED} \textbf{file}s should be sorted by~\textsf{chrom}, then by~\textsf{chromStart} numerically, and finally by~\textsf{chromEnd} numerically. -\textsf{chrom} may be sorted using any scheme (such as lexicographic or numeric order), but all lines with the same~\textsf{chrom} value should occur consecutively. +\textsf{chrom} may be sorted using any scheme (such as lexicographic or numeric order), but all \textbf{data line}s with the same~\textsf{chrom} value should occur consecutively. For example, the lexicographic order of~\texttt{chr1}, \texttt{chr10}, \texttt{chr11}, \texttt{chr12}, {\ldots}, \texttt{chr2}, \texttt{chr20}, \texttt{chr21}, {\ldots}, \texttt{chr3}, {\ldots}, \texttt{chrX}, \texttt{chrY}, \texttt{chrM} is an acceptable sorting. 
The numeric order of~\texttt{chr1}, \texttt{chr2}, {\ldots}, \texttt{chr21}, \texttt{chr22}, \texttt{chrM}, \texttt{chrX}, \texttt{chrY} is also acceptable. -Regardless of the chromosome sorting scheme, lines for two features on the same chromosome should not have any lines for features on other chromosomes between them. +Arbitrary orderings of~\textsf{chrom} are allowed, but regardless of the \textbf{chromosome} sorting scheme, \textbf{data line}s for two \textbf{feature}s on the same \textbf{chromosome} should not have any \textbf{data line}s for \textbf{feature}s on other \textbf{chromosome}s between them. +Multiple \textbf{feature}s that have the same~\textsf{chrom}, \textsf{chromStart}, and \textsf{chromEnd} can appear in any order. \subsection{Whitespace}\label{sec:whitespace} -Though lines may use any kind of whitespace as a delimiter between~\textbf{field}s, a single tab~(\texttt{{\textbackslash}t}) should be used. +Though \textbf{data line}s may use any kind of horizontal whitespace as a delimiter between~\textbf{field}s, a single tab~(\texttt{{\textbackslash}t}) should be used. This is because almost all tools support tabs while some tools do not support other kinds of whitespace. -Also, whitespace within the~\textsf{name}~\textbf{field} may be used only if the \textbf{field}~delimiter is tab throughout the \textbf{file}. +Also, spaces within the~\textsf{name}~\textbf{field} may be used only if the \textbf{field}~delimiter is tab throughout the \textbf{file}. \subsection{Large \acs{BED} files} If a~\textbf{file} intended for visualization is over \SI{50}{\mebi\byte} in size, the~\textbf{file} should be converted to~\texttt{bigBed} format, which is an indexed binary format.\footnote{Kent, W.~James et al. 
@@ -366,6 +379,20 @@ \subsection{Large \acs{BED} files} The~\texttt{bedToBigBed} program may perform this conversion.\footnote{``bigBed Track Format.'' \Ac{UCSC} Genome Browser FAQ, \url{https://genome.ucsc.edu/goldenPath/help/bigBed.html}} +\section{Information supplied out-of-band} + +Some information about a \ac{BED} \textbf{file} can only be supplied unambiguously separately from the \textbf{data line}s of the \ac{BED} \textbf{file}. +This specification does not contain a means for interchanging this information. +Information that must be supplied out-of-band includes: + +\begin{itemize} + \item Which of the first~4 to 12~\textbf{fields} are standard \ac{BED} \textbf{fields} and which are custom \textbf{field}s. + \item The genome assembly that defines \textsf{chrom}, \textsf{chromStart}, and \textsf{chromEnd}. + \item The semantics of \textbf{fields} such as \textsf{score}, \textsf{itemRgb}, thick vs.~thin positions, and block vs.~non-block positions. + \item The definitions of custom \textbf{field}s. + \item Whether the \textbf{field separator} is a single tab character. +\end{itemize} + \section{\acs{UCSC} track files} Track files are files that contain additional information intended for a visualization tool such as the \ac{UCSC} Genome Browser.\footnote{Haeussler, Maximilian et al. @@ -373,8 +400,8 @@ \section{\acs{UCSC} track files} \emph{Nucleic Acids Research} 47(D1):D853--D858. \url{https://doi.org/10.1093/nar/gky1095}} Track files contain browser lines and track lines that precede lines from a file format supported by the Genome Browser.\footnote{``Displaying your own annotations in the Genome Browser.'' \ac{UCSC} Genome Browser FAQ, \url{https://genome.ucsc.edu/goldenPath/help/customTrack.html\#lines}} -Track files are not valid \ac{BED} files --- valid \ac{BED} files must not have any browser or track lines. -To distinguish between \ac{BED} files and track files, track files should use the file extension~\texttt{.track}.
+Track files are not valid \ac{BED} \textbf{file}s --- valid \ac{BED} \textbf{file}s must not have any browser or track lines. +To distinguish between \ac{BED} \textbf{file}s and track files, track files should use the file extension~\texttt{.track}. \section{Acronyms} @@ -388,10 +415,13 @@ \section{Acronyms} \acro{UCSC}{University of California, Santa Cruz} \end{acronym} +\acrodefused{EMBL}{European Molecular Biology Laboratory} + \section{Acknowledgments} We thank W.~James Kent and the \ac{UCSC} Genome Browser team for creating the \ac{BED} format. -We thank W.~James Kent and Hiram Clawson~(\ac{UCSC}); Eric Roberts~(Princess Margaret Cancer Centre); John Marshall~(University of Glasgow); Aaron R.~Quinlan and Brent S.~Pedersen~(University of Utah); Ting Wang~(Washington University in St.~Louis); and the \ac{GA4GH} File Formats Task Team for comments on this specification. +We thank W.~James Kent and Hiram Clawson~(\ac{UCSC}); Eric Roberts~(University Health Network); John Marshall~(University of Glasgow); Aaron R.~Quinlan and Brent S.~Pedersen~(University of Utah); Ting Wang~(Washington University in St.~Louis); Daniel Perrett and Simon Brent (Wellcome Sanger Institute); Jasper Saris (Erasmus Medical Center); Zhenyu Zhang (University of Chicago); Andrew Yates (\ac{EMBL}---European Bioinformatics Institute); and the \ac{GA4GH} File Formats Task Team for comments on this specification. 
+ \end{document} From c80cf077f07ed91237a66b64ec1f91c02400a13b Mon Sep 17 00:00:00 2001 From: Michael Hoffman Date: Mon, 28 Jun 2021 11:09:35 -0400 Subject: [PATCH 4/9] add spacing around tabulars [20210628 draft] --- BEDv1.tex | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/BEDv1.tex b/BEDv1.tex index 21c84e1f6..1c836dea7 100644 --- a/BEDv1.tex +++ b/BEDv1.tex @@ -18,6 +18,7 @@ \usepackage[strict]{changepage} \usepackage[binary-units=true]{siunitx} \usepackage{enumitem} +\usepackage{stackengine} \input{BEDv1.ver} @@ -74,7 +75,7 @@ \subsection{Typographic conventions} \vspace{2ex} \noindent -\begin{tabularx}{\textwidth}{r L L} +\addstackgap[\baselineskip]{\begin{tabularx}{\textwidth}{r L L} \toprule Style & Meaning & Examples \\ \midrule @@ -83,7 +84,7 @@ \subsection{Typographic conventions} Fixed-width & Literals or \ac{regex}es\footnote{POSIX/IEEE~1003.1--2017 Extended Regular Expressions, for the ``C'' locale. \emph{IEEE Standard for Information Technology---Portable Operating System Interface~(POSIX) Base Specifications}, IEEE~1003.1--2017, 2017} & \texttt{.bed}{\quad}\texttt{grep}{\quad}\texttt{[[:alnum:]]+}{\quad}\texttt{ATCG} \\ \bottomrule -\end{tabularx} +\end{tabularx}} \subsection{Terminology and concepts}\label{sec:terms} \begin{description} @@ -170,7 +171,7 @@ \subsection{\acs{BED} fields} \begin{adjustwidth}{-0.5in}{-0.5in} \noindent - \begin{tabularx}{\linewidth}{r l l l L} + \addstackgap[\baselineskip]{\begin{tabularx}{\linewidth}{r l l l L} \toprule Col & Field & Type & Regex or range & Brief description \\ \midrule @@ -187,7 +188,7 @@ \subsection{\acs{BED} fields} 11 & \textsf{blockSizes} & List[Int] & \texttt{([[:digit:]]+,)\{\textsf{blockCount}$-1$\}[[:digit:]]+,?}{\footnotemark} & \textbf{Block} sizes \\ 12 & \textsf{blockStarts} & List[Int] & \texttt{([[:digit:]]+,)\{\textsf{blockCount}$-1$\}[[:digit:]]+,?} & \textbf{Block} start positions \\ \bottomrule - \end{tabularx}\label{sec:table} + \end{tabularx}} 
\footnotetext[5]{\texttt{[[:alnum:]\_]} is equivalent to the \ac{regex} \texttt{[A-Za-z0-9\_]}. % chktex 8 It is also equivalent to the Perl extension \texttt{[[:word:]]}.} \footnotetext[6]{\textsf{chromEnd}-\textsf{chromStart} is the maximum number of~\textbf{block}s that may exist without overlaps.} @@ -341,7 +342,7 @@ \subsection{User-defined fields} Each custom \textbf{field} should contain either one of the following data types or a comma-separated list of values of the same type: \noindent -\begin{tabularx}{\textwidth}{r L} +\addstackgap[\baselineskip]{\begin{tabularx}{\textwidth}{r L} \toprule Type & Definition \\ \midrule @@ -351,7 +352,7 @@ \subsection{User-defined fields} Character & One character, other than tab \\ String & One or more characters, other than tab \\ \bottomrule -\end{tabularx} +\end{tabularx}} This specification does not contain a means for interchanging custom \ac{BED} format definitions. The AutoSQL format\footnote{Kent, W.~James. From 01a9d8e6d2b25d6879e832c01c35a81fe2a6419f Mon Sep 17 00:00:00 2001 From: Jeffrey Niu Date: Mon, 30 Aug 2021 15:24:18 -0400 Subject: [PATCH 5/9] Edits in response to public comments 2021-06-28 through 2021-07-09 [20210830 draft] * Update BEDv1.tex * polished edits * Edits in response to public comments 2021-06-28 through 2021-07-09 * Edits in response to public comments 2021-06-28 through 2021-07-09 fix typo * Edits in reponse to GA4GH PRC * line edit * Edits in response to public comments 2021-07-09 * further line edits and footnote fixing * fix texlint issues * WIP address uninformative/default/empty issues * fix empty/uninformative issues * clarify language on BED fields not being empty * add special-case `diff/BEDv1.pdf` target that uses lualatex * fix minor typo Co-authored-by: Michael Hoffman --- BEDv1.tex | 245 +++++++++++++++++++++++++++++++++--------------------- Makefile | 2 + 2 files changed, 153 insertions(+), 94 deletions(-) diff --git a/BEDv1.tex b/BEDv1.tex index 1c836dea7..92f26bedf 100644 
--- a/BEDv1.tex +++ b/BEDv1.tex @@ -6,7 +6,9 @@ \usepackage{amsmath} \usepackage{booktabs} \usepackage{calc} +\usepackage{caption} \usepackage[flushmargin,hang]{footmisc} +\usepackage{float} \usepackage{microtype} \usepackage{newverbs} \usepackage{tablefootnote} @@ -30,13 +32,22 @@ \definecolor{cverbbg}{gray}{0.93} -\title{The Browser Extensible Data~(BED) format} +\title{The \acf{BED} format} \author{Jeffrey Niu, Danielle Denisko, Michael M.~Hoffman} \date{\headdate} \setlength{\emergencystretch}{\hsize} \setlength{\footnotemargin}{1em} +\floatplacement{table}{htbp} +\setcounter{topnumber}{2} +\setcounter{bottomnumber}{2} +\setcounter{totalnumber}{4} +\setcounter{dbltopnumber}{2} +\renewcommand{\dbltopfraction}{0.9} +\renewcommand{\textfraction}{0.07} +\renewcommand{\floatpagefraction}{0.7} + \interfootnotelinepenalty=1000000 \makesavenoteenv{tabularx} @@ -64,42 +75,49 @@ \section{Specification} -\Ac{BED} is a whitespace-delimited file format, where each~\textbf{file} consists of one or more~\textbf{line}s.\footnote{``Frequently Asked Questions: Data File Formats.'' \ac{UCSC} Genome Browser FAQ, \url{https://genome.ucsc.edu/FAQ/FAQformat.html}} +\Ac{BED} is a whitespace-delimited file format, where each~\textbf{file} consists of zero or more~\textbf{line}s.\footnote{``Frequently Asked Questions: Data File Formats.'' \ac{UCSC} Genome Browser FAQ, \url{https://genome.ucsc.edu/FAQ/FAQformat.html}} Data are in~\textbf{data line}s, which describe discrete genomic~\textbf{feature}s by physical start and end position on a linear~\textbf{chromosome}. The file extension for the \ac{BED} format is~\texttt{.bed}. -\subsection{Typographic conventions} +\subsection{Scope} -This document uses the following typographic conventions: +This specification formalizes reasonable interpretations of the \ac{UCSC} Genome Browser \ac{BED} description. 
+This specification also makes clear potential interoperability issues in the current format, which could be addressed in a future specification. -\vspace{2ex} +\subsection{Typographic conventions} -\noindent -\addstackgap[\baselineskip]{\begin{tabularx}{\textwidth}{r L L} - \toprule - Style & Meaning & Examples \\ - \midrule - Bold & Terms defined in subsections~\ref{sec:terms}--\ref{sec:lines} & \textbf{chromosome}{\quad}\textbf{file} \\ - Sans serif & Names of~\textbf{field}s & \textsf{chrom}{\quad}\textsf{chromStart}{\quad}\textsf{chromEnd} \\ - Fixed-width & Literals or \ac{regex}es\footnote{POSIX/IEEE~1003.1--2017 Extended Regular Expressions, for the ``C'' locale. - \emph{IEEE Standard for Information Technology---Portable Operating System Interface~(POSIX) Base Specifications}, IEEE~1003.1--2017, 2017} & \texttt{.bed}{\quad}\texttt{grep}{\quad}\texttt{[[:alnum:]]+}{\quad}\texttt{ATCG} \\ - \bottomrule -\end{tabularx}} +This document uses several typographic conventions~(\autoref{tab:typographic-conventions}). + +\begin{savenotes} + \begin{table} + \begin{tabularx}{\textwidth}{r L L} + \toprule + Style & Meaning & Examples \\ + \midrule + Bold & Terms defined in subsections~\ref{sec:terms}--\ref{sec:lines} & \textbf{chromosome}{\quad}\textbf{file} \\ + Sans serif & Names of~\textbf{field}s & \textsf{chrom}{\quad}\textsf{chromStart}{\quad}\textsf{chromEnd} \\ + Fixed-width & Literals or \ac{regex}es\footnote{POSIX/IEEE~1003.1--2017 Extended Regular Expressions, for the ``C'' locale. 
+ \emph{IEEE Standard for Information Technology---Portable Operating System Interface~(POSIX) Base Specifications}, IEEE~1003.1--2017, 2017} & \texttt{.bed}{\quad}\texttt{grep}{\quad}\texttt{[[:alnum:]]+}{\quad}\texttt{ATCG} \\ + \bottomrule + \end{tabularx} + \caption{\textbf{Typographic conventions.}}\label{tab:typographic-conventions} + \end{table} +\end{savenotes} \subsection{Terminology and concepts}\label{sec:terms} \begin{description} -\item[0-start, half-open coordinate system:] +\item[0-based, half-open coordinate system:] A coordinate system where the first base starts at position~0, and the start of the interval is included but the end is not. For example, for a sequence of bases~\texttt{ACTGCG}, the bases given by the interval~[2,~4) are~\texttt{TG}. % chktex 9 -\item[BED$n$:] +\item[\acs{BED}$n$:] A~\textbf{file} with the first $n$~\textbf{field}s of the \ac{BED} format. For example, \textbf{BED3}~means a~\textbf{file} with only the first 3~\textbf{field}s; \textbf{BED12}~means a~\textbf{file} with all 12~\textbf{field}s. -\item[BED$n$+:] - A~\textbf{file} that has $n$~\textbf{field}s of the \ac{BED} format, followed by any number of~\textbf{field}s of custom data defined by a user. +\item[\acs{BED}$n$+:] + A~\textbf{file} that has the first $n$~\textbf{field}s of the \ac{BED} format, followed by any number of~\textbf{field}s of custom data defined by a user. -\item[BED$n$+$m$:] +\item[\acs{BED}$n$+$m$:] A~\textbf{file} that has a custom format starting with the first $n$~\textbf{field}s of the \ac{BED} format, followed by $m$~\textbf{field}s of custom data defined by a user. For example, \textbf{BED6+4}~means a~\textbf{file} with the first 6~\textbf{field}s of the \ac{BED} format, followed by 4~user-defined~\textbf{field}s. @@ -121,12 +139,14 @@ \subsection{Terminology and concepts}\label{sec:terms} \item[field:] Data stored as non-tab text. 
- All~\textbf{field}s are 7-bit US \ac{ASCII} printable characters\footnote{Characters in the range `\texttt{{\textbackslash}x20}' to `\texttt{{\textbackslash}x7e}', therefore not including any control characters}. - Only some \textbf{field}s can be empty, and they can only be empty when a single tab is used as the \textbf{field separator}. + All~\textbf{field}s are 7-bit US \ac{ASCII} printable characters\footnote{Characters in the range \texttt{{\textbackslash}x20} to \texttt{{\textbackslash}x7e}, therefore not including any control characters}. + Only custom \textbf{field}s can be empty, and they can only be empty when a single tab is used as the \textbf{field separator} throughout the \textbf{file}. \item[field separator:] One or more horizontal whitespace characters (space or tab). - The \textbf{field} separator must match the \ac{regex}~\texttt{[ {\textbackslash}]+}. + The \textbf{field} separator must match the \ac{regex}~\texttt{[ {\textbackslash}t]+}. + The \textbf{field} separator can vary throughout the \textbf{file}. + Some capabilities of the \ac{BED} format, however, are available only when a single tab is used as the \textbf{field} separator throughout the \textbf{file}. \item[file:] Sequence of one or more~\textbf{line}s. @@ -137,7 +157,7 @@ \subsection{Terminology and concepts}\label{sec:terms} Discussed more fully in~\autoref{sec:lines} \item[line separator:] - Either carriage return, newline, or carriage return followed by newline\footnote{A newline is defined as `\texttt{{\textbackslash}n}'} + Either carriage return~(\texttt{{\textbackslash}r}, equivalent to \texttt{{\textbackslash}x0d}), newline~(\texttt{{\textbackslash}n}, equivalent to \texttt{{\textbackslash}x0a}), or carriage return followed by newline~(\texttt{{\textbackslash}r{\textbackslash}n}, equivalent to \texttt{{\textbackslash}x0d{\textbackslash}x0a}). The same \textbf{line separator} must be used throughout the \textbf{file}.
\end{description} @@ -161,48 +181,66 @@ \subsubsection{Comment lines and blank lines} \subsection{\acs{BED} fields} -Each~\textbf{data line} contains between~3 and 12~~\textbf{field}s delimited by a \textbf{field separator}. -The first 3~\textbf{field}s are mandatory, and the last 9~\textbf{field}s are optional. -In optional~\textbf{field}s, the order is binding---if 1~\textbf{field} is filled, then all previous~\textbf{field}s must also be filled. -However, \textbf{BED10} and \textbf{BED11} are prohibited. +Each~\textbf{data line} contains between~3 and 12~\textbf{field}s delimited by a \textbf{field separator}. +The first 3~\textbf{field}s are mandatory, and the last 9~\textbf{field}s are optional~(\autoref{tab:fields}). +In optional~\textbf{field}s, the order is binding---if a \textbf{field} is filled, then all previous~\textbf{field}s must also be filled. +Any mandatory or optional \textbf{field} included on any \textbf{data line} in the \textbf{file} must not be empty on any other \textbf{data line}. +\textbf{BED10} and \textbf{BED11} are prohibited. + +\begin{savenotes} + \begin{table} + \begin{adjustwidth}{-0.5in}{-0.5in} + \begin{tabularx}{\linewidth}{r l l l L} + \toprule + Col & Field & Type & Regex or range & Brief description \\ + \midrule + 1 + & \textsf{chrom} + & String + & \texttt{[[:alnum:]\_]\{1,255\}}\footnote{\texttt{[[:alnum:]\_]} is equivalent to the \ac{regex} \texttt{[A-Za-z0-9\_]}. 
% chktex 8 + It is also equivalent to the Perl extension \texttt{[[:word:]]}} + & \textbf{Chromosome} name \\ + + 2 & \textsf{chromStart} & Int & $[0, 2^{64}-1]$ & \textbf{Feature} start position \\ + 3 & \textsf{chromEnd} & Int & $[0, 2^{64} -1]$ & \textbf{Feature} end position \\ + 4 & \textsf{name} & String & \texttt{[{\textbackslash}x20-{\textbackslash}x7e]\{1,255\}} & \textbf{Feature} description \\ + 5 & \textsf{score} & Int & $[0, 1000]$ & A numerical value \\ + 6 & \textsf{strand} & String & \texttt{[-+.]} & \textbf{Feature} strand \\ + 7 & \textsf{thickStart} & Int & $[0, 2^{64}-1]$ & Thick start position \\ + 8 & \textsf{thickEnd} & Int & $[0, 2^{64}-1]$ & Thick end position \\ + 9 & \textsf{itemRgb} & Int,Int,Int & \texttt{(}$[0, 255], [0,255], [0,255]$\texttt{) | 0} & Display color \\ % chktex 9 + + 10 + & \textsf{blockCount} + & Int + & $[0, \textsf{chromEnd}-\textsf{chromStart}]$\footnote{\textsf{chromEnd}-\textsf{chromStart} is the maximum number of~\textbf{block}s that may exist without overlaps} + & Number of \textbf{block}s \\ + + 11 + & \textsf{blockSizes} + & List[Int] + & \texttt{([[:digit:]]+,)\{\textsf{blockCount}$-1$\}[[:digit:]]+,?}\footnote{For example, if~$\textsf{blockCount} = 4$, then the allowed \ac{regex} would be~\texttt{([[:digit:]]+,)\{3\}[[:digit:]]+,?}} + & \textbf{Block} sizes \\ + + 12 & \textsf{blockStarts} & List[Int] & \texttt{([[:digit:]]+,)\{\textsf{blockCount}$-1$\}[[:digit:]]+,?} & \textbf{Block} start positions \\ + \bottomrule + \end{tabularx} + \end{adjustwidth} + \caption{\textbf{Fields.}}\label{tab:fields} + \end{table} +\end{savenotes} In a \ac{BED}~\textbf{file}, each~\textbf{data line} must have the same number of~\textbf{field}s. The positions in \ac{BED}~\textbf{field}s are all described in the~\textbf{0-based, half-open coordinate system}. 
-\begin{adjustwidth}{-0.5in}{-0.5in} - \noindent - \addstackgap[\baselineskip]{\begin{tabularx}{\linewidth}{r l l l L} - \toprule - Col & Field & Type & Regex or range & Brief description \\ - \midrule - 1 & \textsf{chrom} & String & \texttt{[[:alnum:]\_]\{1,255\}}{\footnotemark} & \textbf{Chromosome} name \\ - 2 & \textsf{chromStart} & Int & $[0, 2^{64}-1]$ & \textbf{Feature} start position \\ - 3 & \textsf{chromEnd} & Int & $[0, 2^{64} -1]$ & \textbf{Feature} end position \\ - 4 & \textsf{name} & String & \texttt{[{\textasciicircum}{\textbackslash}t]\{1,255\}} & \textbf{Feature} description \\ - 5 & \textsf{score} & Int & $[0, 1000]$ & A numerical value \\ - 6 & \textsf{strand} & String & \texttt{[-+.]} & \textbf{Feature} strand \\ - 7 & \textsf{thickStart} & Int & $[0, 2^{64}-1]$ & Thick start position \\ - 8 & \textsf{thickEnd} & Int & $[0, 2^{64}-1]$ & Thick end position \\ - 9 & \textsf{itemRgb} & Int,Int,Int & \texttt{(}$[0, 255], [0,255], [0,255]$\texttt{) | 0} & Display color \\ % chktex 9 - 10 & \textsf{blockCount} & Int & $[0, \textsf{chromEnd}-\textsf{chromStart}]${\footnotemark} & Number of \textbf{block}s \\ - 11 & \textsf{blockSizes} & List[Int] & \texttt{([[:digit:]]+,)\{\textsf{blockCount}$-1$\}[[:digit:]]+,?}{\footnotemark} & \textbf{Block} sizes \\ - 12 & \textsf{blockStarts} & List[Int] & \texttt{([[:digit:]]+,)\{\textsf{blockCount}$-1$\}[[:digit:]]+,?} & \textbf{Block} start positions \\ - \bottomrule - \end{tabularx}} - \footnotetext[5]{\texttt{[[:alnum:]\_]} is equivalent to the \ac{regex} \texttt{[A-Za-z0-9\_]}. 
% chktex 8 - It is also equivalent to the Perl extension \texttt{[[:word:]]}.} - \footnotetext[6]{\textsf{chromEnd}-\textsf{chromStart} is the maximum number of~\textbf{block}s that may exist without overlaps.} - \footnotetext{For example, if~$\textsf{blockCount} = 4$, then the allowed \ac{regex} would be~\texttt{([[:digit:]]+,)\{3\}[[:digit:]]+,?}} -\end{adjustwidth} - \subsection{Coordinates} \begin{enumerate} \item \textsf{chrom}: The name of the~\textbf{chromosome} where the~\textbf{feature} is present. - Limiting only to word characters only, instead of all non-whitespace characters, makes \ac{BED}~\textbf{file}s more portable to varying environments which may make different assumptions about allowed characters. + Limiting to word characters only, instead of all non-whitespace printable characters, makes \ac{BED}~\textbf{file}s more portable to varying environments which may make different assumptions about allowed characters. The name must be between~1 and 255~characters long, inclusive. \item \textsf{chromStart}: Start position of the~\textbf{feature} on the~\textbf{chromosome}. - \textsf{chromStart}~must be an integer greater than or equal to~0 and less than the total number of bases of the~\textbf{chromosome} to which it belongs. + \textsf{chromStart}~must be an integer greater than or equal to~0 and less than or equal to the total number of bases of the~\textbf{chromosome} to which it belongs. If the size of the~\textbf{chromosome} is unknown, then \textsf{chromStart}~must be less than or equal to~$2^{64} - 1$, which is the maximum size of an unsigned 64-bit integer. \item \textsf{chromEnd}: End position of the~\textbf{feature} on the~\textbf{chromosome}. @@ -218,18 +256,19 @@ \subsection{Simple attributes} \item \textsf{name}: String that describes the~\textbf{feature}. \textsf{name} must be~1 to 255~non-tab characters. - \textsf{name} must not be empty or contain whitespace, unless the only \textbf{field separator} is a single tab. 
+ \textsf{name} must not contain whitespace, unless the only \textbf{field separator} is a single tab. + Multiple \textbf{data line}s may share the same \textsf{name}. + In \textbf{BED5+} \textbf{file}s where all \textbf{features} have uninformative \textsf{name}s, dot~(\texttt{.}) may be used as a \textsf{name} on every \textbf{data line}. A visual representation of the \ac{BED} format may display \textsf{name} next to the~\textbf{feature}. \item \textsf{score}: Integer between~0 and~1000, inclusive. - In~\textbf{BED6+} \textbf{file}s where all \textbf{feature}s have uninformative \textsf{score}s, \texttt{0} should be used as the \textsf{score} on every \textbf{data line}. + In \textbf{BED6+} \textbf{file}s where all \textbf{feature}s have uninformative \textsf{score}s, \texttt{0} should be used as the \textsf{score} on every \textbf{data line}. A visual representation of the \ac{BED} format may shade \textbf{feature}s differently depending on their \textsf{score}. \item \textsf{strand}: Strand that the~\textbf{feature} appears on. The \textsf{strand} may either refer to the~\texttt{+}~(sense or coding) strand or the~\texttt{-}~(antisense or complementary) strand. - If the~\textbf{feature} has no \textsf{strand} information or unknown \textsf{strand}, then a dot~(\texttt{.}) must be used as a default value. - \textsf{strand} cannot be empty in \textbf{BED6+} \textbf{file}s. - A parser should treat \textbf{files} that are not \textbf{BED6+} as if \textsf{strand} where \texttt{.}. + If the \textbf{feature} has no \textsf{strand} information or unknown \textsf{strand}, then a dot~(\texttt{.}) must be used as an uninformative value. + \textsf{strand} should be treated as \texttt{.} when parsing files that are not \textbf{BED6+}. \end{enumerate} \subsection{Display attributes} @@ -238,19 +277,21 @@ \subsection{Display attributes} \item \textsf{thickStart}: Start position at which the~\textbf{feature} is visualized with a thicker or accented display. 
This value must be an integer between~\textsf{chromStart} and~\textsf{chromEnd}, inclusive. - There is no specified default value for~\textsf{thickStart}. + In \textbf{BED7+} \textbf{file}s where all \textbf{feature}s have uninformative \textsf{thickStart}s, the value of \textsf{chromStart} should be used as the \textsf{thickStart} on every \textbf{data line}. \item \textsf{thickEnd}: End position at which the~\textbf{feature} is visualized with a thicker or accented display. This value must be an integer greater than or equal to~\textsf{thickStart} and less than or equal to~\textsf{chromEnd}, inclusive. - In \ac{BED} \textbf{file}s with fewer than 7~\textbf{field}s, the whole~\textbf{feature} has thick display. + In \textbf{BED8+} \textbf{file}s where all \textbf{feature}s have uninformative \textsf{thickEnd}s, the value of \textsf{chromEnd} should be used as the \textsf{thickEnd} on every \textbf{data line}. + In \ac{BED} \textbf{file}s that are not \textbf{BED7+}, the whole~\textbf{feature} has thick display. In \textbf{BED7+}~\textbf{file}s, to achieve the same effect, set \textsf{thickStart}~equal to~\textsf{chromStart} and \textsf{thickEnd}~equal to~\textsf{chromEnd}. - If this~\textbf{field} is not specified but \textsf{thickStart}~is, then the entire~\textbf{feature} has thick display. - For \textbf{BED7+} \textbf{file}s that are not \textbf{BED8+}, there is no specified default value for~\textsf{thickEnd}. + If \textsf{thickEnd} is not specified but \textsf{thickStart}~is, then the entire~\textbf{feature} has thick display. \item \textsf{itemRgb}: A triple of integers that determines the color of this~\textbf{feature} when visualized. The triple is three integers separated by commas. Each integer is between~0 and~255, inclusive. To make a~\textbf{feature} black, \textsf{itemRgb}~may be a single~\texttt{0}, which is visualized identically to a~\textbf{feature} with \textsf{itemRgb} of \texttt{0,0,0}. 
+ An \textsf{itemRgb} of~\texttt{0} is a special case and no other single-number value is valid. + In \textbf{BED9+} \textbf{file}s where all \textbf{feature}s have uninformative \textsf{itemRgb}s, \texttt{0} should be used as the \textsf{itemRgb} on every \textbf{data line}. \end{enumerate} \subsection{Blocks} @@ -260,14 +301,12 @@ \subsection{Blocks} \item \textsf{blockCount}: Number of~\textbf{block}s in the~\textbf{feature}. \textsf{blockCount}~must be an integer greater than 0. \textsf{blockCount}~is mandatory in~\textbf{BED12+}~\textbf{file}s. - Empty~\textsf{blockCount} are not allowed, because~\textsf{blockSizes} and~\textsf{blockStarts} rely on~\textsf{blockCount}. A visual representation of the \ac{BED} format may have blocks appear thicker than the rest of the~\textbf{feature}. \item \textsf{blockSizes}: Comma-separated list of length~\textsf{blockCount} containing the size of each~\textbf{block}. There must be no spaces before or after commas. There may be a trailing comma after the last element of the list. \textsf{blockSizes}~is mandatory in \textbf{BED12+} \textbf{file}s. - Empty~\textsf{blockSizes} is not allowed, because \textsf{blockStarts}~cannot be verified without~\textsf{blockSizes}. \item \textsf{blockStarts}: Comma-separated list of length~\textsf{blockCount} containing each \textbf{block}'s~start position, relative to~\textsf{chromStart}. There must not be spaces before or after the commas. @@ -280,7 +319,6 @@ \subsection{Blocks} Moreover, the~\textbf{block}s must not overlap. The list must be sorted in ascending order. \textsf{blockStarts}~is mandatory in~\textbf{BED12+} \textbf{file}s. - Empty~\textsf{blockStarts} is not allowed. \end{enumerate} \section{Examples} @@ -323,8 +361,7 @@ \subsection{Mandatory fields} \subsection{Optional fields}\label{sec:optional} \begin{itemize} -\item \textsf{name}: If a \textbf{feature} has no name, then a dot~(\texttt{.}) should be used. 
- Names should avoid using the space character even if the only \textbf{field separator} is a single tab character, because parsers may interpret a space as a \textbf{field separator}. +\item \textsf{name}: Names should avoid using the space character even if the only \textbf{field separator} is a single tab character, because parsers may interpret a space as a \textbf{field separator}. \item \textsf{itemRgb}: Eight or fewer colors should be used as too many colors may slow down visualizations and are difficult for humans to distinguish.\footnote{``Frequently Asked Questions: Data File Formats.'' \ac{UCSC} Genome Browser FAQ, @@ -339,20 +376,25 @@ \subsection{User-defined fields} Custom data \textbf{fields} may contain any printable 7-bit US \ac{ASCII} character (which includes spaces, but excludes tabs, newlines, and other control characters). Custom data \textbf{fields} can only be empty when a single tab is used as the \textbf{field separator} throughout the \textbf{file}. Definitions of a custom \ac{BED} format should restrict the type of each \textbf{field} to the extent possible. 
-Each custom \textbf{field} should contain either one of the following data types or a comma-separated list of values of the same type: - -\noindent -\addstackgap[\baselineskip]{\begin{tabularx}{\textwidth}{r L} - \toprule - Type & Definition \\ - \midrule - Integer & String representation of 64-bit signed integer\footnote{\emph{IEEE 754--1985 IEEE Standard for Binary Floating-Point Arithmetic.} IEEE 754--1985, 1985} \\ - Unsigned & String representation of 64-bit unsigned integer\footnotemark[10] \\ - Float & String representation of 64-bit floating point number\footnotemark[10] \\ - Character & One character, other than tab \\ - String & One or more characters, other than tab \\ - \bottomrule -\end{tabularx}} +Each custom \textbf{field} should contain either one of several specified data types~(\autoref{tab:custom-data-types}) or a comma-separated list of values of any type other than String. + +\begin{savenotes} + \begin{table} + \begin{tabularx}{\textwidth}{r L} + \toprule + Type & Definition \\ + \midrule + Integer & Decimal string representation of 64-bit signed integer \\ + Unsigned & Decimal string representation of 64-bit unsigned integer \\ + Float & Decimal string representation of 64-bit floating point number\footnote{\emph{IEEE Standard for Binary Floating-Point Arithmetic.} + IEEE 754--1985, 1985} \\ + Character & One character, other than tab \\ + String & One or more characters, other than tab \\ + \bottomrule + \end{tabularx} + \caption{\textbf{Custom field data types.}}\label{tab:custom-data-types} + \end{table} +\end{savenotes} This specification does not contain a means for interchanging custom \ac{BED} format definitions. The AutoSQL format\footnote{Kent, W.~James. @@ -363,14 +405,24 @@ \subsection{Sorting} \Ac{BED} \textbf{file}s should be sorted by~\textsf{chrom}, then by~\textsf{chromStart} numerically, and finally by~\textsf{chromEnd} numerically. 
\textsf{chrom} may be sorted using any scheme (such as lexicographic or numeric order), but all \textbf{data line}s with the same~\textsf{chrom} value should occur consecutively. For example, the lexicographic order of~\texttt{chr1}, \texttt{chr10}, \texttt{chr11}, \texttt{chr12}, {\ldots}, \texttt{chr2}, \texttt{chr20}, \texttt{chr21}, {\ldots}, \texttt{chr3}, {\ldots}, \texttt{chrX}, \texttt{chrY}, \texttt{chrM} is an acceptable sorting. +This ordering is equivalent to sorting the \textbf{file} using the command \verb|LC_ALL=C| \verb|sort| \verb|-k 1,1| \verb|-k 2,2n| \verb|-k 3,3n|. The numeric order of~\texttt{chr1}, \texttt{chr2}, {\ldots}, \texttt{chr21}, \texttt{chr22}, \texttt{chrM}, \texttt{chrX}, \texttt{chrY} is also acceptable. Arbitrary orderings of~\textsf{chrom} are allowed, but regardless of the \textbf{chromosome} sorting scheme, \textbf{data line}s for two \textbf{feature}s on the same \textbf{chromosome} should not have any \textbf{data line}s for \textbf{feature}s on other \textbf{chromosome}s between them. Multiple \textbf{feature}s that have the same~\textsf{chrom}, \textsf{chromStart}, and \textsf{chromEnd} can appear in any order. +\textbf{Comment line}s and \textbf{blank line}s do not have to be sorted according to the schemes mentioned. + +Sorting is recommended because the implementation of downstream operations is easier if features of one chromosome are all grouped together and \textsf{chromStart} is non-decreasing within a chromosome. + +For \textbf{BED4+} files, a sorting scheme may also order by optional \textbf{field}s and any custom \textbf{field}s. +A recommendation for how to do this is outside the scope of this version of the specification. +Total deterministic sorting of \ac{BED} \textbf{file}s can prevent downstream analyses from producing different results depending on sort order.
\subsection{Whitespace}\label{sec:whitespace} -Though \textbf{data line}s may use any kind of horizontal whitespace as a delimiter between~\textbf{field}s, a single tab~(\texttt{{\textbackslash}t}) should be used. +We recommend that only a single tab~(\texttt{{\textbackslash}t}) be used as \textbf{field separator}. This is because almost all tools support tabs while some tools do not support other kinds of whitespace. -Also, spaces within the~\textsf{name}~\textbf{field} may be used only if the \textbf{field}~delimiter is tab throughout the \textbf{file}. +Also, spaces within the~\textsf{name}~\textbf{field} may be used only if the \textbf{field separator} is tab throughout the \textbf{file}. + +It would be sensible for future major versions of this specification or overlay formats built on top of this specification to require that only a single tab be used as \textbf{field separator}. \subsection{Large \acs{BED} files} If a~\textbf{file} intended for visualization is over \SI{50}{\mebi\byte} in size, the~\textbf{file} should be converted to~\texttt{bigBed} format, which is an indexed binary format.\footnote{Kent, W.~James et al. @@ -380,15 +432,21 @@ \subsection{Large \acs{BED} files} The~\texttt{bedToBigBed} program may perform this conversion.\footnote{``bigBed Track Format.'' \Ac{UCSC} Genome Browser FAQ, \url{https://genome.ucsc.edu/goldenPath/help/bigBed.html}} +Tabix is another option for storing larger \ac{BED} \textbf{file}s.\footnote{Li H. + (2011) ``Tabix: fast retrieval of sequence features from generic TAB-delimited files.'' + \emph{Bioinformatics} 27(5):718--719. + \url{https://doi.org/10.1093/bioinformatics/btq671}} +Tabix works only on \textbf{file}s using a single tab as the \textbf{field separator}. + \section{Information supplied out-of-band} Some information about a \ac{BED} \textbf{file} can only be supplied unambiguously separately from the \textbf{data line}s of the \ac{BED} \textbf{file}. 
This specification does not contain a means for interchanging this information. Information that must be supplied out-of-band include: -\begin{itemize} +\begin{itemize}[noitemsep] \item Which of the first~4 to 12~\textbf{fields} are standard \ac{BED} \textbf{fields} and which are custom \textbf{field}s. - \item The genome assembly that define \textsf{chrom}, \textsf{chromStart}, and \textsf{chromEnd}. + \item The genome assembly that defines \textsf{chrom}, \textsf{chromStart}, and \textsf{chromEnd}. \item The semantics of \textbf{fields} such as \textsf{score}, \textsf{itemRgb}, thick vs.~thin positions, and block vs.~non-block positions. \item The definitions of custom \textbf{field}s. \item Whether the \textbf{field separator} is a single tab character. @@ -397,17 +455,17 @@ \section{Information supplied out-of-band} \section{\acs{UCSC} track files} Track files are files that contain additional information intended for a visualization tool such as the \ac{UCSC} Genome Browser.\footnote{Haeussler, Maximilian et al. - (2019) ``The \acl{UCSC} Genome Browser database: 2019 update.'' + (2019) ``The \acs{UCSC} Genome Browser database: 2019 update.'' \emph{Nucleic Acids Research} 47(D1):D853--D858. \url{https://doi.org/10.1093/nar/gky1095}} Track files contain browser lines and track lines that precede lines from a file format supported by the Genome Browser.\footnote{``Displaying your own annotations in the Genome Browser.'' \ac{UCSC} Genome Browser FAQ, \url{https://genome.ucsc.edu/goldenPath/help/customTrack.html\#lines}} -Track files are not valid \ac{BED} \textbf{file}s --- valid \ac{BED} \textbf{file}s must not have any browser or track lines. +Track files are not valid \ac{BED} \textbf{file}s---valid \ac{BED} \textbf{file}s must not have any browser or track lines. To distinguish between \ac{BED} \textbf{file}s and track files, track files should use the file extension~\texttt{.track}. 
\section{Acronyms} % using the optional argument to acronym to set the label width causes it to use the list environment instead of description, which means we can't set nosep easily -\setlist[description]{labelwidth=\widthof{\textbf{GA4GH}},nosep} +\setlist[description]{labelwidth=\widthof{\textbf{\acs{GA4GH}}},nosep} \begin{acronym} \acro{ASCII}{American Standard Code for Information Interchange} \acro{BED}{Browser Extensible Data} @@ -421,8 +479,7 @@ \section{Acronyms} \section{Acknowledgments} We thank W.~James Kent and the \ac{UCSC} Genome Browser team for creating the \ac{BED} format. -We thank W.~James Kent and Hiram Clawson~(\ac{UCSC}); Eric Roberts~(University Health Network); John Marshall~(University of Glasgow); Aaron R.~Quinlan and Brent S.~Pedersen~(University of Utah); Ting Wang~(Washington University in St.~Louis); Daniel Perrett and Simon Brent (Wellcome Sanger Institute); Jasper Saris (Erasmus Medical Center); Zhenyu Zhang (University of Chicago); Andrew Yates (\ac{EMBL}---European Bioinformatics Institute); and the \ac{GA4GH} File Formats Task Team for comments on this specification. - +We thank W.~James Kent and Hiram Clawson~(\ac{UCSC}); Eric Roberts~(University Health Network); John Marshall~(University of Glasgow); Aaron R.~Quinlan and Brent S.~Pedersen~(University of Utah); Ting Wang~(Washington University in St.~Louis); Daniel Perrett and Simon Brent~(Wellcome Sanger Institute); Jasper Saris~(Erasmus Medical Center); Zhenyu Zhang~(University of Chicago); Andrew Yates~(\ac{EMBL}---European Bioinformatics Institute); Michael Schatz~(Johns Hopkins University); Igor Dolgalev~(New York University); Colin Diesh~(University of California, Berkeley); Alex Reynolds~(Altius Institute for Biomedical Sciences); Junjun Zhang~(Ontario Institute for Cancer Research); and the \ac{GA4GH} File Formats Task Team for comments on this specification.
\end{document} diff --git a/Makefile b/Makefile index 94b61bf52..6c46ae0d0 100644 --- a/Makefile +++ b/Makefile @@ -61,6 +61,8 @@ NEW = diff/%.pdf: %.tex BIBINPUTS=:.. TEXINPUTS=:..:../new latexdiff-vc --pdf --dir diff --force --git --only-changes --graphics-markup=none --ignore-warnings --revision $(OLD) $(if $(NEW),--revision $(NEW)) $< +diff/BEDv1.pdf: BEDv1.tex + BIBINPUTS=:.. TEXINPUTS=:..:../new latexdiff-vc --config LATEX=lualatex --pdf --dir diff --force --git --only-changes --graphics-markup=none --ignore-warnings --revision $(OLD) $(if $(NEW),--revision $(NEW)) $< show-styles: @sed -n '/\\usepackage/s/.*{\(.*\)}$$/\1/p' *.tex | sort | uniq -c From dde256618ae133793ef9cfe276773024fca97300 Mon Sep 17 00:00:00 2001 From: Michael Hoffman Date: Tue, 31 Aug 2021 13:04:12 -0400 Subject: [PATCH 6/9] Further edits [20210831 draft] * replace `user` with more specific terms; addresses comment on #570 * move part of `Custom fields` description from recommendation to specification * add constraint on whitespace in custom fields; addresses comment on #570 * change definition of character and string to use printable characters * exclude Character and String from comma-separated lists * define `BED field` and `custom field` as terminology and use them when possible * clarify definition of BEDn+; addresses comment on #570 --- BEDv1.tex | 78 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 33 deletions(-) diff --git a/BEDv1.tex b/BEDv1.tex index 92f26bedf..3e0db09f8 100644 --- a/BEDv1.tex +++ b/BEDv1.tex @@ -110,16 +110,22 @@ \subsection{Terminology and concepts}\label{sec:terms} A coordinate system where the first base starts at position~0, and the start of the interval is included but the end is not. For example, for a sequence of bases~\texttt{ACTGCG}, the bases given by the interval~[2,~4) are~\texttt{TG}. % chktex 9 +\item[\acs{BED} field:] + One of the 12~standard~\textbf{field}s defined in this specification. 
+ The first 3~\textbf{\acs{BED} field}s are mandatory. + The remaining 9~\textbf{\acs{BED} field}s are optional. + \item[\acs{BED}$n$:] - A~\textbf{file} with the first $n$~\textbf{field}s of the \ac{BED} format. - For example, \textbf{BED3}~means a~\textbf{file} with only the first 3~\textbf{field}s; \textbf{BED12}~means a~\textbf{file} with all 12~\textbf{field}s. + A~\textbf{file} with the first $n$~\textbf{\acs{BED} field}s. + For example, \textbf{BED3}~means a~\textbf{file} with only the first 3~\textbf{\acs{BED} field}s; \textbf{BED12}~means a~\textbf{file} with all 12~\textbf{\acs{BED} field}s. \item[\acs{BED}$n$+:] - A~\textbf{file} that has the first $n$~\textbf{field}s of the \ac{BED} format, followed by any number of~\textbf{field}s of custom data defined by a user. + A~\textbf{file} that has at least the first $n$~\textbf{\acs{BED} field}s, followed by zero or more of the remaining~\textbf{\acs{BED} field}s and zero or more~\textbf{custom field}s. + A~\acs{BED}$n$ \textbf{file} also satisfies the definition of a \acs{BED}$n$+ \textbf{file}. \item[\acs{BED}$n$+$m$:] - A~\textbf{file} that has a custom format starting with the first $n$~\textbf{field}s of the \ac{BED} format, followed by $m$~\textbf{field}s of custom data defined by a user. - For example, \textbf{BED6+4}~means a~\textbf{file} with the first 6~\textbf{field}s of the \ac{BED} format, followed by 4~user-defined~\textbf{field}s. + A~\textbf{file} that has a custom format starting with the first $n$~\textbf{\acs{BED} field}s, followed by $m$~\textbf{custom field}s. + For example, \textbf{BED6+4}~means a~\textbf{file} with the first 6~\textbf{\acs{BED} field}s, followed by 4~\textbf{custom field}s. \item[block:] Linear subfeatures within a~\textbf{feature}. @@ -133,6 +139,10 @@ \subsection{Terminology and concepts}\label{sec:terms} The name of each \textbf{chromosome} is often prefixed with~\texttt{chr}.
Examples of \textbf{chromosome} names include~\texttt{chr1}, \texttt{21}, \texttt{chrX}, \texttt{chrM}, \texttt{chrUn}, \texttt{chr19\_KI270914v1\_alt}, and~\texttt{chrUn\_KI270435v1}. +\item[custom field:] + A~\textbf{field} defined by the \textbf{file}~creator. + \textbf{Custom field}s occur in each \textbf{line} after any \textbf{\acs{BED} field}s. + \item[feature:] A linear region of a~\textbf{chromosome} with specified properties. For example, a~\textbf{file}'s~\textbf{feature}s might all be peaks called from ChIP-seq data, or transcript. @@ -140,13 +150,12 @@ \subsection{Terminology and concepts}\label{sec:terms} \item[field:] Data stored as non-tab text. All~\textbf{field}s are 7-bit US \ac{ASCII} printable characters\footnote{Characters in the range \texttt{{\textbackslash}x20} to \texttt{{\textbackslash}x7e}, therefore not including any control characters}. - Only custom \textbf{field}s can be empty, and they can only be empty when a single tab is used as the \textbf{field separator} throughout the \textbf{file}. \item[field separator:] One or more horizontal whitespace characters (space or tab). - The \textbf{field} separator must match the \ac{regex}~\texttt{[ {\textbackslash}t]+}. - The \textbf{field} separator can vary throughout the \textbf{file}. - Some capabilities of the \ac{BED} format, however, are available only when a single tab is used as the \textbf{field} separator throughout the \textbf{file}. + The \textbf{field separator} must match the \ac{regex}~\texttt{[ {\textbackslash}t]+}. + The \textbf{field separator} can vary throughout the \textbf{file}. + Some capabilities of the \ac{BED} format, however, are available only when a single tab is used as the \textbf{field separator} throughout the \textbf{file}. \item[file:] Sequence of one or more~\textbf{line}s. @@ -154,7 +163,7 @@ \subsection{Terminology and concepts}\label{sec:terms} \item[line:] String terminated by a~\textbf{line separator}, in one of the following classes. 
Either a~\textbf{data line}, a~\textbf{comment line}, or a~\textbf{blank line}. - Discussed more fully in~\autoref{sec:lines} + Discussed more fully in~\autoref{sec:lines}. \item[line separator:] Either carriage return~(\texttt{{\textbackslash}r}, equivalent to \texttt{{\textbackslash}x0d}), newline~(\texttt{{\textbackslash}n}, equivalent to \texttt{{\textbackslash}x0a}), or carriage return followed by newline~(\texttt{{\textbackslash}r{\textbackslash}n}, equivalent to \texttt{{\textbackslash}x0d{\textbackslash}x0a}). @@ -181,10 +190,10 @@ \subsubsection{Comment lines and blank lines} \subsection{\acs{BED} fields} -Each~\textbf{data line} contains between~3 and 12~\textbf{field}s delimited by a \textbf{field separator}. -The first 3~\textbf{field}s are mandatory, and the last 9~\textbf{field}s are optional~(\autoref{tab:fields}). -In optional~\textbf{field}s, the order is binding---if a \textbf{field} is filled, then all previous~\textbf{field}s must also be filled. -Any mandatory or optional \textbf{field} included on any \textbf{data line} in the \textbf{file} must not be empty on any other \textbf{data line}. +Each~\textbf{data line} contains between~3 and 12~\textbf{\acs{BED} field}s delimited by a \textbf{field separator}. +The first 3~\textbf{\acs{BED} field}s are mandatory, and the last 9~\textbf{\acs{BED} field}s are optional~(\autoref{tab:fields}). +In optional~\textbf{\acs{BED} field}s, the order is binding---if an optional \textbf{\acs{BED} field} is filled, then all previous~\textbf{\acs{BED} field}s must also be filled. +Any \textbf{\acs{BED} field} included on any \textbf{data line} in the \textbf{file} must not be empty on any other \textbf{data line}. \textbf{BED10} and \textbf{BED11} are prohibited.
\begin{savenotes} @@ -192,7 +201,7 @@ \subsection{\acs{BED} fields} \begin{adjustwidth}{-0.5in}{-0.5in} \begin{tabularx}{\linewidth}{r l l l L} \toprule - Col & Field & Type & Regex or range & Brief description \\ + Col & \acs{BED} Field & Type & Regex or range & Brief description \\ \midrule 1 & \textsf{chrom} @@ -226,12 +235,12 @@ \subsection{\acs{BED} fields} \bottomrule \end{tabularx} \end{adjustwidth} - \caption{\textbf{Fields.}}\label{tab:fields} + \caption{\textbf{\acs{BED} Fields.}}\label{tab:fields} \end{table} \end{savenotes} In a \ac{BED}~\textbf{file}, each~\textbf{data line} must have the same number of~\textbf{field}s. -The positions in \ac{BED}~\textbf{field}s are all described in the~\textbf{0-based, half-open coordinate system}. +The positions in \textbf{\acs{BED} field}s are all described in the~\textbf{0-based, half-open coordinate system}. \subsection{Coordinates} \begin{enumerate} @@ -258,7 +267,7 @@ \subsection{Simple attributes} \textsf{name} must be~1 to 255~non-tab characters. \textsf{name} must not contain whitespace, unless the only \textbf{field separator} is a single tab. Multiple \textbf{data line}s may share the same \textsf{name}. - In \textbf{BED5+} \textbf{file}s where all \textbf{features} have uninformative \textsf{name}s, dot~(\texttt{.}) may be used as a \textsf{name} on every \textbf{data line}. + In \textbf{BED5+} \textbf{file}s where all \textbf{feature}s have uninformative \textsf{name}s, dot~(\texttt{.}) may be used as a \textsf{name} on every \textbf{data line}. A visual representation of the \ac{BED} format may display \textsf{name} next to the~\textbf{feature}. \item \textsf{score}: Integer between~0 and~1000, inclusive. @@ -321,6 +330,12 @@ \subsection{Blocks} \textsf{blockStarts}~is mandatory in~\textbf{BED12+} \textbf{file}s. 
\end{enumerate} +\subsection{Custom fields} + +\textbf{Custom field}s defined by the \textbf{file}~creator may contain any printable 7-bit US \ac{ASCII} character (which includes spaces, but excludes tabs, newlines, and other control characters). +\textbf{Custom field}s may only be empty or contain whitespace when a single tab is used as the \textbf{field separator} throughout the \textbf{file}. +This specification does not contain a means for interchanging custom \ac{BED} format definitions. + \section{Examples} \subsection[title]{Example BED6 file from the \acs{UCSC} Genome Browser FAQ\footnote{``Frequently @@ -351,7 +366,7 @@ \subsection{Example BED12 file from the \acs{UCSC} Genome Browser FAQ} \section{Recommended practice for the \acs{BED} format} -\subsection{Mandatory fields} +\subsection{Mandatory \acs{BED} fields} \begin{itemize} \item \textsf{chrom}: The name of each~\textbf{chromosome} should also match the names from a reference genome, if applicable. For example, in the human genome, the \textbf{chromosome}s may be named~\texttt{chr1} to \texttt{chr22}, \texttt{chrX}, \texttt{chrY}, and~\texttt{chrM}. @@ -359,7 +374,7 @@ \subsection{Mandatory fields} For example, one should not use both~\texttt{17} and~\texttt{chr17} to represent the same~\textbf{chromosome} in the same~\textbf{file}. \end{itemize} -\subsection{Optional fields}\label{sec:optional} +\subsection{Optional \acs{BED} fields}\label{sec:optional} \begin{itemize} \item \textsf{name}: Names should avoid using the space character even if the only \textbf{field separator} is a single tab character, because parsers may interpret a space as a \textbf{field separator}. @@ -371,12 +386,10 @@ \subsection{Optional fields}\label{sec:optional} \end{itemize} -\subsection{User-defined fields} +\subsection{Custom fields} -Custom data \textbf{fields} may contain any printable 7-bit US \ac{ASCII} character (which includes spaces, but excludes tabs, newlines, and other control characters). 
-Custom data \textbf{fields} can only be empty when a single tab is used as the \textbf{field separator} throughout the \textbf{file}. -Definitions of a custom \ac{BED} format should restrict the type of each \textbf{field} to the extent possible. -Each custom \textbf{field} should contain either one of several specified data types~(\autoref{tab:custom-data-types}) or a comma-separated list of values of any type other than String. +Definitions of a custom \ac{BED} format should restrict the type of each \textbf{custom field} to the extent possible. +Each \textbf{custom field} should contain either one of several specified data types~(\autoref{tab:custom-data-types}) or a comma-separated list of Integer, Unsigned, or Float. \begin{savenotes} \begin{table} @@ -388,15 +401,14 @@ \subsection{User-defined fields} Unsigned & Decimal string representation of 64-bit unsigned integer \\ Float & Decimal string representation of 64-bit floating point number\footnote{\emph{IEEE Standard for Binary Floating-Point Arithmetic.} IEEE 754--1985, 1985} \\ - Character & One character, other than tab \\ - String & One or more characters, other than tab \\ + Character & One printable character \\ + String & One or more printable characters \\ \bottomrule \end{tabularx} \caption{\textbf{Custom field data types.}}\label{tab:custom-data-types} \end{table} \end{savenotes} -This specification does not contain a means for interchanging custom \ac{BED} format definitions. The AutoSQL format\footnote{Kent, W.~James. (2000) ``AutoSQL.'' \url{https://hgwdev.gi.ucsc.edu/~kent/exe/doc/autoSql.doc}} provides one method for defining custom \ac{BED} formats in a separate file. @@ -413,14 +425,14 @@ \subsection{Sorting} Sorting is recommended because the implementation of downstream operations is easier if features of one chromosome are all grouped together and \textsf{chromStart} is non-decreasing within a chromosome. 
-For \textbf{BED4+} files, a sorting scheme may also order by optional \textbf{field}s and any custom \textbf{field}s. +For \textbf{BED4+} files, a sorting scheme may also order by optional \textbf{\acs{BED} field}s and any \textbf{custom field}s. A recommendation for how to do this is outside the scope of this version of the specification. Total deterministic sorting of \ac{BED} \textbf{file}s can prevent downstream analyses from producing different results depending on sort order. \subsection{Whitespace}\label{sec:whitespace} We recommend that only a single tab~(\texttt{{\textbackslash}t}) be used as \textbf{field separator}. This is because almost all tools support tabs while some tools do not support other kinds of whitespace. -Also, spaces within the~\textsf{name}~\textbf{field} may be used only if the \textbf{field separator} is tab throughout the \textbf{file}. +Also, spaces within the~\textsf{name}~\textbf{\acs{BED} field} may be used only if the \textbf{field separator} is tab throughout the \textbf{file}. It would be sensible for future major versions of this specification or overlay formats built on top of this specification to require that only a single tab be used as \textbf{field separator}. @@ -445,10 +457,10 @@ \section{Information supplied out-of-band} Information that must be supplied out-of-band include: \begin{itemize}[noitemsep] - \item Which of the first~4 to 12~\textbf{fields} are standard \ac{BED} \textbf{fields} and which are custom \textbf{field}s. + \item Which of the first~4 to 12~\textbf{field}s are standard \textbf{\acs{BED} field}s and which are \textbf{custom field}s. \item The genome assembly that defines \textsf{chrom}, \textsf{chromStart}, and \textsf{chromEnd}. - \item The semantics of \textbf{fields} such as \textsf{score}, \textsf{itemRgb}, thick vs.~thin positions, and block vs.~non-block positions. - \item The definitions of custom \textbf{field}s. 
+ \item The semantics of \textbf{field}s such as \textsf{score}, \textsf{itemRgb}, thick vs.~thin positions, and block vs.~non-block positions. + \item The definitions of \textbf{custom field}s. \item Whether the \textbf{field separator} is a single tab character. \end{itemize} From 636782d8ae5b926ebbcae7d850cd4d5e86d1bec8 Mon Sep 17 00:00:00 2001 From: Michael Hoffman Date: Thu, 23 Sep 2021 09:57:27 -0400 Subject: [PATCH 7/9] fix minor issues noted on #570 & newline typo [20210923 draft] --- BEDv1.tex | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/BEDv1.tex b/BEDv1.tex index 3e0db09f8..b9aaba0bb 100644 --- a/BEDv1.tex +++ b/BEDv1.tex @@ -124,9 +124,12 @@ \subsection{Terminology and concepts}\label{sec:terms} A~\acs{BED}$n$ \textbf{file} also satisfies the definition of a \acs{BED}$n$+ \textbf{file}. \item[\acs{BED}$n$+$m$:] - A~\textbf{file} that has a custom format starting with the first $n$~\textbf{field}s of the \ac{BED} format, followed by $m$~\textbf{custom field}. + A~\textbf{file} that has a custom format starting with the first $n$~\textbf{field}s of the \ac{BED} format, followed by $m$~\textbf{custom field}s. For example, \textbf{BED6+4}~means a~\textbf{file} with the first 6~\textbf{\acs{BED} field}s, followed by 4~custom~\textbf{field}s. +\item[blank line:] + A~\textbf{line} consisting entirely of horizontal whitespace. + \item[block:] Linear subfeatures within a~\textbf{feature}. Usually used to designate exons. @@ -139,10 +142,16 @@ \subsection{Terminology and concepts}\label{sec:terms} The name of each \textbf{chromosome} is often prefixed with~\texttt{chr}. Examples of \textbf{chromosome} names include~\texttt{chr1}, \texttt{21}, \texttt{chrX}, \texttt{chrM}, \texttt{chrUn}, \texttt{chr19\_KI270914v1\_alt}, and~\texttt{chrUn\_KI270435v1}. +\item[comment line:] + A~\textbf{line} that starts with~\texttt{\#} with no horizontal whitespace beforehand. 
+ \item[custom field:] A~\textbf{field} defined by the \textbf{file}~creator. \textbf{Custom field}s occur in each \textbf{line} after any \textbf{\acs{BED} field}s. +\item[data line:] + A~\textbf{line} that contains \textbf{feature}~data. + \item[feature:] A linear region of a~\textbf{chromosome} with specified properties. For example, a~\textbf{file}'s~\textbf{feature}s might all be peaks called from ChIP-seq data, or transcript. @@ -166,7 +175,7 @@ \subsection{Terminology and concepts}\label{sec:terms} Discussed more fully in~\autoref{sec:lines}. \item[line separator:] - Either carriage return~(\texttt{{\textbackslash}r}, equivalent to \texttt{{\textbackslash}x0d}), newline~(\texttt{{\textbackslash}r}, equivalent to \texttt{{\textbackslash}x0a}), or carriage return followed by newline~(\texttt{{\textbackslash}r{\textbackslash}n}, equivalent to \texttt{{\textbackslash}x0d{\textbackslash}x0a}). + Either carriage return~(\texttt{{\textbackslash}r}, equivalent to \texttt{{\textbackslash}x0d}), newline~(\texttt{{\textbackslash}n}, equivalent to \texttt{{\textbackslash}x0a}), or carriage return followed by newline~(\texttt{{\textbackslash}r{\textbackslash}n}, equivalent to \texttt{{\textbackslash}x0d{\textbackslash}x0a}). The same \textbf{line separator} must be used throughout the \textbf{file}. \end{description} @@ -174,7 +183,7 @@ \subsection{Lines}\label{sec:lines} \subsubsection{Data lines} -\textbf{Data line}s contain \textbf{feature}~information. +\textbf{Data line}s contain \textbf{feature}~data. A \textbf{data line} is composed of~\textbf{field}s separated by \textbf{field separator}s. \subsubsection{Comment lines and blank lines} From 9985e36194e1e228251a352ac4752062eb00e7a9 Mon Sep 17 00:00:00 2001 From: Michael Hoffman Date: Tue, 2 Nov 2021 15:38:03 -0400 Subject: [PATCH 8/9] Makefile: switch `diff/BEDv1.pdf` target to use target-specific variables No longer have a hard-coded special command line for this target. 
Add `LATEXDIFF_ENGINE` variable to serve the same purpose as `LATEXMK_ENGINE` but for latexdiff. --- Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 6c46ae0d0..1453d18b9 100644 --- a/Makefile +++ b/Makefile @@ -39,6 +39,8 @@ LATEXMK = latexmk $(LATEXMK_ENGINE) $(LATEXMK_FLAGS) LATEXMK_ENGINE = --pdf --pdflatex='$(PDFLATEX)' LATEXMK_FLAGS = +LATEXDIFF_ENGINE = --config LATEX=pdflatex + new/%.pdf: %.tex $(LATEXMK) --output-directory=new $< @@ -59,10 +61,10 @@ OLD = HEAD NEW = diff/%.pdf: %.tex - BIBINPUTS=:.. TEXINPUTS=:..:../new latexdiff-vc --pdf --dir diff --force --git --only-changes --graphics-markup=none --ignore-warnings --revision $(OLD) $(if $(NEW),--revision $(NEW)) $< + BIBINPUTS=:.. TEXINPUTS=:..:../new latexdiff-vc $(LATEXDIFF_ENGINE) --pdf --dir diff --force --git --only-changes --graphics-markup=none --ignore-warnings --revision $(OLD) $(if $(NEW),--revision $(NEW)) $< + +diff/BEDv1.pdf: LATEXDIFF_ENGINE = --config LATEX=lualatex -diff/BEDv1.pdf: BEDv1.tex - BIBINPUTS=:.. TEXINPUTS=:..:../new latexdiff-vc --config LATEX=lualatex --pdf --dir diff --force --git --only-changes --graphics-markup=none --ignore-warnings --revision $(OLD) $(if $(NEW),--revision $(NEW)) $< show-styles: @sed -n '/\\usepackage/s/.*{\(.*\)}$$/\1/p' *.tex | sort | uniq -c From 2663cda8aac0da1ae37df5402ae2f520b7d2354f Mon Sep 17 00:00:00 2001 From: Michael Hoffman Date: Thu, 11 Nov 2021 13:13:11 -0500 Subject: [PATCH 9/9] `README.md` and `index.md`: add `BEDv1.tex` in new `Discrete genomic feature data files` heading --- README.md | 6 ++++++ index.md | 1 + 2 files changed, 7 insertions(+) diff --git a/README.md b/README.md index 569684d44..7b57ec04d 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,11 @@ These formats are discussed on the [vcftools-spec mailing list][vcfspec-ml]. **[BCFv2_qref.tex]** is a quick reference describing just the layout of data within BCF2 files. 
+Discrete genomic feature data files +----------------------------------- + +**[BEDv1.tex]** is the canonical specification for the GA4GH Browser Extensible Data (BED) format. + File encryption --------------- @@ -72,6 +77,7 @@ Transfer protocols [VCFv4.4.draft.tex]: http://samtools.github.io/hts-specs/VCFv4.4.draft.pdf [BCFv1_qref.tex]: http://samtools.github.io/hts-specs/BCFv1_qref.pdf [BCFv2_qref.tex]: http://samtools.github.io/hts-specs/BCFv2_qref.pdf +[BEDv1.tex]: https://samtools.github.io/hts-specs/BEDv1.pdf [crypt4gh.tex]: http://samtools.github.io/hts-specs/crypt4gh.pdf [Htsget.md]: http://samtools.github.io/hts-specs/htsget.html [Refget.md]: https://samtools.github.io/hts-specs/refget.html diff --git a/index.md b/index.md index ed12fc264..2ff60cd1d 100644 --- a/index.md +++ b/index.md @@ -25,6 +25,7 @@ Specifications: - [VCF v4.2](VCFv4.2.pdf) - [VCF v4.3](VCFv4.3.pdf) - [VCF v4.4 draft](VCFv4.4.draft.pdf) +- [BED v1](BEDv1.pdf) - [crypt4gh](crypt4gh.pdf) - [Htsget](htsget.html) - [Refget](refget.html)