Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

build: add tool to create deterministic tarballs #2362

Merged
merged 8 commits into from
Jun 21, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions tensorboard/pip_package/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,26 @@ sh_binary(
],
)

# Command-line tool that generates `.tar.gz` archives deterministically
# (implemented in `deterministic_tar_gz.py`).
py_binary(
name = "deterministic_tar_gz",
srcs = ["deterministic_tar_gz.py"],
srcs_version = "PY2AND3",
)

# End-to-end test for the `:deterministic_tar_gz` tool. The tool binary is
# listed under `data` rather than `deps` because the test runs it as a
# subprocess instead of importing it.
py_test(
name = "deterministic_tar_gz_test",
size = "medium",
timeout = "short",
srcs = ["deterministic_tar_gz_test.py"],
data = [
":deterministic_tar_gz", # invoked as subprocess
],
srcs_version = "PY2AND3",
deps = [
"//tensorboard:test",
],
)

genrule(
name = "license",
srcs = [
Expand Down
99 changes: 99 additions & 0 deletions tensorboard/pip_package/deterministic_tar_gz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# Much of this module is forked from Servo's `command_base.py`,
# originally written by Anton Ovchinnikov and also provided under the
# Apache-2.0 License:
#
# https://github.com/servo/servo/blob/d9fdf42bfe53dab08ba38fcdb349e84355f4cb3e/python/servo/command_base.py
# https://github.com/servo/servo/pull/12244
#
"""Generate `.tar.gz` archives deterministically.

Some differences from `tar czf ARCHIVE [FILES]...`:

- Timestamps (modification time, access time) are omitted.
- Owners and groups are normalized (uid and gid 0, names `root`).
- Archive entries are not prefixed with the directory name or path.
- Dirnames are stripped.

See <https://reproducible-builds.org/docs/archives/>.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import gzip
import os
import sys
import tarfile


def main():
    """Parse command-line arguments and write the deterministic archive.

    Reads the output archive path and zero or more input file paths from
    the command line. Each input is stored under its basename, in the
    order given, with its tar metadata passed through `cleanse`.

    If two inputs share a basename, writes a message to stderr and exits
    with status 1 without touching the output path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "archive",
        metavar="ARCHIVE",
        help="name for the output `.tar.gz` archive",
    )
    parser.add_argument(
        "files",
        metavar="files",
        nargs="*",
        help="files to include in the archive; basenames must be distinct",
    )
    args = parser.parse_args()
    archive = args.archive
    files = args.files
    del args

    # Entries are stored by basename only, so duplicates would collide.
    if len(frozenset(os.path.basename(f) for f in files)) != len(files):
        sys.stderr.write("Input basenames must be distinct; got: %r\n" % files)
        sys.exit(1)

    # `O_TRUNC` matters: without it, writing over an existing archive that
    # is longer than the new one leaves stale bytes after the gzip stream,
    # making the output depend on whatever was previously at that path.
    # (`fd` will be closed by `fdopen` context manager below)
    fd = os.open(archive, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o644)
    # NOTE(review): the first positional argument of `gzip.GzipFile` is
    # `filename`, not `mode`, so "wb" ends up as the name recorded in the
    # gzip header, and the write mode is taken from `out_file`. It is left
    # unchanged here so that the bytes of generated archives do not change.
    # `mtime=0` keeps the current time out of the gzip header.
    with \
            os.fdopen(fd, "wb") as out_file, \
            gzip.GzipFile("wb", fileobj=out_file, mtime=0) as gzip_file, \
            tarfile.open(fileobj=gzip_file, mode="w:") as tar_file:
        for f in files:
            arcname = os.path.basename(f)
            tar_file.add(f, filter=cleanse, recursive=False, arcname=arcname)


def cleanse(tarinfo):
    """Reset the host-dependent metadata of a tar entry to fixed values.

    Intended for use as the `filter` kwarg of `tarfile.TarFile.add`.

    Args:
      tarinfo: The `tarfile.TarInfo` object to normalize; it is modified
        in place.

    Returns:
      The `tarinfo` object that was passed in, after modification.
    """
    # Ownership and modification time vary between machines and builds;
    # everything else on the entry is left untouched.
    fixed_fields = (
        ("uid", 0),
        ("gid", 0),
        ("uname", "root"),
        ("gname", "root"),
        ("mtime", 0),
    )
    for field_name, fixed_value in fixed_fields:
        setattr(tarinfo, field_name, fixed_value)
    return tarinfo


# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
112 changes: 112 additions & 0 deletions tensorboard/pip_package/deterministic_tar_gz_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""End-to-end tests for the `deterministic_tar_gz` tool."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gzip
import os
import subprocess
import tarfile

from tensorboard import test as tb_test


class DeterministicTarGzTest(tb_test.TestCase):
    """End-to-end tests that run the `deterministic_tar_gz` tool as a subprocess."""

    def setUp(self):
        """Run the base class's setup, then locate the tool under test."""
        # Overriding `setUp` without chaining would skip whatever per-test
        # initialization the base test case performs.
        super(DeterministicTarGzTest, self).setUp()
        # The tool is looked up in the same directory as the path named by
        # the `TEST_BINARY` environment variable.
        self._tool_path = os.path.join(
            os.path.dirname(os.environ["TEST_BINARY"]),
            "deterministic_tar_gz",
        )

    def _run_tool(self, args):
        """Invoke the tool with the given arguments.

        Args:
          args: Command-line arguments for the tool, as a list of `str`s.

        Returns:
          The tool's standard output, as returned by
          `subprocess.check_output`.

        Raises:
          subprocess.CalledProcessError: If the tool exits with a nonzero
            status.
        """
        return subprocess.check_output([self._tool_path] + args)

    def _write_file(self, directory, filename, contents, utime=None):
        """Write a file and set its access and modification times.

        Args:
          directory: Path to parent directory for the file, as a `str`.
          filename: Name of file inside directory, as a `str`.
          contents: File contents, as a `str`.
          utime: If not `None`, a 2-tuple of numbers (`int`s or `float`s)
            representing seconds since epoch for `atime` and `mtime`,
            respectively, as in the second argument to `os.utime`. Defaults
            to a fixed value; the file's timestamps will always be set.

        Returns:
          The new file path.
        """
        filepath = os.path.join(directory, filename)
        with open(filepath, "w") as outfile:
            outfile.write(contents)
        if utime is None:
            utime = (123, 456)
        os.utime(filepath, utime)
        return filepath

    def test_correct_contents(self):
        """The archive holds the inputs by basename, in argument order."""
        tempdir = self.get_temp_dir()
        archive = os.path.join(tempdir, "out.tar.gz")
        directory = os.path.join(tempdir, "src")
        os.mkdir(directory)
        self._run_tool([
            archive,
            self._write_file(directory, "1.txt", "one"),
            self._write_file(directory, "2.txt", "two"),
        ])
        with gzip.open(archive) as gzip_file:
            with tarfile.open(fileobj=gzip_file, mode="r:") as tar_file:
                self.assertEqual(tar_file.getnames(), ["1.txt", "2.txt"])  # in order
                self.assertEqual(tar_file.extractfile("1.txt").read(), b"one")
                self.assertEqual(tar_file.extractfile("2.txt").read(), b"two")

    def test_invariant_under_mtime(self):
        """Inputs differing only in timestamps give byte-identical archives."""
        tempdir = self.get_temp_dir()

        archive_1 = os.path.join(tempdir, "out_1.tar.gz")
        directory_1 = os.path.join(tempdir, "src_1")
        os.mkdir(directory_1)
        self._run_tool([
            archive_1,
            self._write_file(directory_1, "1.txt", "one", utime=(1, 2)),
            self._write_file(directory_1, "2.txt", "two", utime=(3, 4)),
        ])

        # Same names and contents as above, but with different timestamps.
        archive_2 = os.path.join(tempdir, "out_2.tar.gz")
        directory_2 = os.path.join(tempdir, "src_2")
        os.mkdir(directory_2)
        self._run_tool([
            archive_2,
            self._write_file(directory_2, "1.txt", "one", utime=(7, 8)),
            self._write_file(directory_2, "2.txt", "two", utime=(5, 6)),
        ])

        with open(archive_1, "rb") as infile:
            archive_1_contents = infile.read()
        with open(archive_2, "rb") as infile:
            archive_2_contents = infile.read()

        self.assertEqual(archive_1_contents, archive_2_contents)

    def test_invariant_under_owner_and_group_names(self):
        """Placeholder: always skipped, since the test cannot change ownership."""
        self.skipTest("Can't really test this; no way to chown.")


# Run this module's tests when the file is executed directly.
if __name__ == "__main__":
    tb_test.main()