Commit

lab 8 and circleci
sergeyk committed Apr 5, 2021
1 parent f766653 commit 1f25a73
Showing 163 changed files with 19,231 additions and 1,064 deletions.
22 changes: 7 additions & 15 deletions .circleci/config.yml
@@ -1,19 +1,18 @@
 # Python CircleCI 2.0 configuration file
 #
 # Check https://circleci.com/docs/2.0/language-python/ for more details
 #
 version: 2
 jobs:
   build:
     docker:
-      - image: circleci/python:3.7
+      - image: circleci/python:3.6
 
     steps:
       - checkout
 
       - restore_cache:
           keys:
-            - cache-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }}
+            - cache-{{ checksum "requirements/prod.txt" }}-{{ checksum "requirements/dev.txt" }}
 
       - run:
           name: Install Git LFS
@@ -29,39 +28,32 @@ jobs:
       - run:
           name: Install Shellcheck
           command: |
-            curl -OL https://storage.googleapis.com/shellcheck/shellcheck-stable.linux.x86_64.tar.xz
+            curl -OL https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz
             tar xf shellcheck-stable.linux.x86_64.tar.xz
             sudo mv shellcheck-stable/shellcheck /usr/local/bin
           working_directory: /tmp/shellcheck
 
       - run:
           name: install dependencies
           command: |
-            sed -i 's/tensorflow==/tensorflow-cpu==/' requirements.txt
-            pip install -r requirements.txt
-            pip install -r requirements-dev.txt
+            pip install -r requirements/prod.txt -r requirements/dev.txt
       - save_cache:
-          key: cache-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }}
+          key: cache-{{ checksum "requirements/prod.txt" }}-{{ checksum "requirements/dev.txt" }}
           paths:
             - ~/.local
 
       - run:
           name: run linting
           when: always
           command: |
-            cd lab8 && PYTHONPATH=. ./tasks/lint.sh
+            cd lab8; ./tasks/lint.sh
       - run:
           name: run prediction tests
           when: always
           command: |
-            cd lab8 && PYTHONPATH=. pytest -s text_recognizer/tests/*
-      - run:
-          name: run evaluation tests
-          command: |
-            cd lab8 && PYTHONPATH=. pytest -s evaluation/*
+            cd lab8; ./tasks/test.sh
       - store_artifacts:
           path: test-reports
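The new cache key is worth unpacking: CircleCI's {{ checksum "..." }} template expands to a digest of the named file, so after this commit the dependency cache is keyed to both requirements/prod.txt and requirements/dev.txt and is invalidated whenever either set of pins changes. A rough Python sketch of the idea follows; this is a conceptual model only, not CircleCI's actual implementation, and the file paths are simply the ones from the config above:

import hashlib


def cache_key(*requirement_files: str) -> str:
    """Build a cache key that changes whenever any of the given files change."""
    digests = []
    for path in requirement_files:
        with open(path, "rb") as f:
            digests.append(hashlib.sha256(f.read()).hexdigest())
    return "cache-" + "-".join(digests)


print(cache_key("requirements/prod.txt", "requirements/dev.txt"))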
1 change: 1 addition & 0 deletions .gitignore
@@ -44,3 +44,4 @@ _labs
 logs
 .mypy_cache
 notebooks/lightning_logs
+lightning_logs/
1 change: 1 addition & 0 deletions lab1/text_recognizer/data/__init__.py
@@ -4,3 +4,4 @@
 
 
 
+
57 changes: 37 additions & 20 deletions lab1/text_recognizer/data/base_data_module.py
@@ -1,17 +1,16 @@
"""Base DataModule class."""
from pathlib import Path
from typing import Dict
from typing import Collection, Dict, Optional, Tuple, Union
import argparse
import os

from torch.utils.data import ConcatDataset, DataLoader
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torchvision import transforms

from text_recognizer import util
from text_recognizer.data.util import BaseDataset


def load_and_print_info(data_module_class: type) -> None:
def load_and_print_info(data_module_class) -> None:
"""Load EMNISTLines and print info."""
parser = argparse.ArgumentParser()
data_module_class.add_to_argparse(parser)
@@ -26,7 +25,7 @@ def _download_raw_dataset(metadata: Dict, dl_dirname: Path) -> Path:
     dl_dirname.mkdir(parents=True, exist_ok=True)
     filename = dl_dirname / metadata["filename"]
     if filename.exists():
-        return
+        return filename
     print(f"Downloading raw dataset from {metadata['url']} to {filename}...")
     util.download_url(metadata["url"], filename)
     print("Computing SHA-256...")
@@ -52,12 +51,15 @@ def __init__(self, args: argparse.Namespace = None) -> None:
         self.batch_size = self.args.get("batch_size", BATCH_SIZE)
         self.num_workers = self.args.get("num_workers", NUM_WORKERS)
 
-        self.on_gpu = isinstance(self.args.get('gpus', None), (str, int))
+        self.on_gpu = isinstance(self.args.get("gpus", None), (str, int))
 
         # Make sure to set the variables below in subclasses
-        self.dims = None
-        self.output_dims = None
-        self.mapping = None
+        self.dims: Tuple[int, ...]
+        self.output_dims: Tuple[int, ...]
+        self.mapping: Collection
+        self.data_train: Union[BaseDataset, ConcatDataset]
+        self.data_val: Union[BaseDataset, ConcatDataset]
+        self.data_test: Union[BaseDataset, ConcatDataset]
 
     @classmethod
     def data_dirname(cls):
@@ -77,26 +79,41 @@ def config(self):
"""Return important settings of the dataset, which will be passed to instantiate models."""
return {"input_dims": self.dims, "output_dims": self.output_dims, "mapping": self.mapping}

-    def prepare_data(self):
+    def prepare_data(self, *args, **kwargs) -> None:
         """
-        Use this method to do things that might write to disk or that need to be done only from a single GPU in distributed settings (so don't set state `self.x = y`).
+        Use this method to do things that might write to disk or that need to be done only from a single GPU
+        in distributed settings (so don't set state `self.x = y`).
         """
-        pass
 
-    def setup(self, stage=None):
+    def setup(self, stage: Optional[str] = None) -> None:
         """
         Split into train, val, test, and set dims.
         Should assign `torch Dataset` objects to self.data_train, self.data_val, and optionally self.data_test.
         """
-        self.data_train = None
-        self.data_val = None
-        self.data_test = None
 
     def train_dataloader(self):
-        return DataLoader(self.data_train, shuffle=True, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=self.on_gpu)
+        return DataLoader(
+            self.data_train,
+            shuffle=True,
+            batch_size=self.batch_size,
+            num_workers=self.num_workers,
+            pin_memory=self.on_gpu,
+        )
 
     def val_dataloader(self):
-        return DataLoader(self.data_val, shuffle=False, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=self.on_gpu)
+        return DataLoader(
+            self.data_val,
+            shuffle=False,
+            batch_size=self.batch_size,
+            num_workers=self.num_workers,
+            pin_memory=self.on_gpu,
+        )
 
     def test_dataloader(self):
-        return DataLoader(self.data_test, shuffle=False, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=self.on_gpu)
+        return DataLoader(
+            self.data_test,
+            shuffle=False,
+            batch_size=self.batch_size,
+            num_workers=self.num_workers,
+            pin_memory=self.on_gpu,
+        )
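This refactor replaces the `self.dims = None` placeholders with bare type annotations, so subclasses must assign real values; `setup()` no longer resets the datasets to None, and the three dataloader methods now assume `data_train`/`data_val`/`data_test` exist. A minimal hypothetical subclass showing the contract (`FakeImages` and its torchvision `FakeData` source are illustrative, not part of the repo):

import argparse
from typing import Optional

from torchvision import transforms
from torchvision.datasets import FakeData

from text_recognizer.data.base_data_module import BaseDataModule


class FakeImages(BaseDataModule):
    """Hypothetical DataModule over random images, for illustration only."""

    def __init__(self, args: argparse.Namespace = None) -> None:
        super().__init__(args)
        self.dims = (1, 28, 28)  # input shape, passed to models via config()
        self.output_dims = (1,)
        self.mapping = list(range(10))

    def setup(self, stage: Optional[str] = None) -> None:
        def make(n: int) -> FakeData:
            return FakeData(size=n, image_size=self.dims, num_classes=10, transform=transforms.ToTensor())

        self.data_train, self.data_val, self.data_test = make(512), make(64), make(64)

The `pin_memory=self.on_gpu` argument in the dataloaders above is a small but real optimization: pinned host memory speeds up host-to-GPU copies, and is pointless on CPU-only runs, so it is tied to whether a `--gpus` argument was passed.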
11 changes: 6 additions & 5 deletions lab1/text_recognizer/data/mnist.py
@@ -10,9 +10,10 @@
 DOWNLOADED_DATA_DIRNAME = BaseDataModule.data_dirname() / "downloaded"
 
 # NOTE: temp fix until https://github.com/pytorch/vision/issues/1938 is resolved
-from six.moves import urllib
+from six.moves import urllib  # pylint: disable=wrong-import-position, wrong-import-order
 
 opener = urllib.request.build_opener()
-opener.addheaders = [('User-agent', 'Mozilla/5.0')]
+opener.addheaders = [("User-agent", "Mozilla/5.0")]
 urllib.request.install_opener(opener)

@@ -30,15 +31,15 @@ def __init__(self, args: argparse.Namespace) -> None:
         self.output_dims = (1,)
         self.mapping = list(range(10))
 
-    def prepare_data(self):
+    def prepare_data(self, *args, **kwargs) -> None:
         """Download train and test MNIST data from PyTorch canonical source."""
         TorchMNIST(self.data_dir, train=True, download=True)
         TorchMNIST(self.data_dir, train=False, download=True)
 
-    def setup(self, stage=None):
+    def setup(self, stage=None) -> None:
         """Split into train, val, test, and set dims."""
         mnist_full = TorchMNIST(self.data_dir, train=True, transform=self.transform)
-        self.data_train, self.data_val = random_split(mnist_full, [55000, 5000])
+        self.data_train, self.data_val = random_split(mnist_full, [55000, 5000])  # type: ignore
         self.data_test = TorchMNIST(self.data_dir, train=False, transform=self.transform)


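A hedged usage sketch of the module above, assuming the lab1 package is on PYTHONPATH, the class is named MNIST as elsewhere in the labs, and the base class supplies the default batch size:

import argparse

from text_recognizer.data.mnist import MNIST

parser = argparse.ArgumentParser()
MNIST.add_to_argparse(parser)
args = parser.parse_args([])

data = MNIST(args)
data.prepare_data()  # downloads once; sets no state, safe in distributed runs
data.setup()         # 55,000/5,000 train/val split plus the canonical 10,000 test images
xs, ys = next(iter(data.train_dataloader()))
print(xs.shape, ys.shape)  # e.g. torch.Size([128, 1, 28, 28]) torch.Size([128])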
2 changes: 1 addition & 1 deletion lab1/text_recognizer/data/util.py
@@ -87,6 +87,6 @@ def split_dataset(base_dataset: BaseDataset, fraction: float, seed: int) -> Tupl
"""
split_a_size = int(fraction * len(base_dataset))
split_b_size = len(base_dataset) - split_a_size
return torch.utils.data.random_split(
return torch.utils.data.random_split( # type: ignore
base_dataset, [split_a_size, split_b_size], generator=torch.Generator().manual_seed(seed)
)
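The explicit generator is what makes split_dataset deterministic: the same seed always produces the same index permutation, so train/val membership is stable across runs and machines. A self-contained check of that property:

import torch
from torch.utils.data import TensorDataset, random_split

base = TensorDataset(torch.arange(10))
a1, b1 = random_split(base, [8, 2], generator=torch.Generator().manual_seed(42))
a2, b2 = random_split(base, [8, 2], generator=torch.Generator().manual_seed(42))
assert list(a1.indices) == list(a2.indices) and list(b1.indices) == list(b2.indices)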
30 changes: 24 additions & 6 deletions lab1/text_recognizer/lit_models/base.py
@@ -9,7 +9,23 @@
 ONE_CYCLE_TOTAL_STEPS = 100
 
 
-class BaseLitModel(pl.LightningModule):
+class Accuracy(pl.metrics.Accuracy):
+    """Accuracy Metric with a hack."""
+
+    def update(self, preds: torch.Tensor, target: torch.Tensor) -> None:
+        """
+        Metrics in Pytorch-lightning 1.2+ versions expect preds to be between 0 and 1 else fails with the ValueError:
+        "The `preds` should be probabilities, but values were detected outside of [0,1] range."
+        This is being tracked as a bug in https://github.com/PyTorchLightning/metrics/issues/60.
+        This method just hacks around it by normalizing preds before passing it in.
+        Normalized preds are not necessary for accuracy computation as we just care about argmax().
+        """
+        if preds.min() < 0 or preds.max() > 1:
+            preds = torch.nn.functional.softmax(preds, dim=-1)
+        super().update(preds=preds, target=target)
+
+
+class BaseLitModel(pl.LightningModule):  # pylint: disable=too-many-ancestors
     """
     Generic PyTorch-Lightning class that must be initialized with a PyTorch module.
     """
@@ -25,15 +41,15 @@ def __init__(self, model, args: argparse.Namespace = None):
         self.lr = self.args.get("lr", LR)
 
         loss = self.args.get("loss", LOSS)
-        if not loss in ("ctc", "transformer"):
+        if loss not in ("ctc", "transformer"):
             self.loss_fn = getattr(torch.nn.functional, loss)
 
         self.one_cycle_max_lr = self.args.get("one_cycle_max_lr", None)
         self.one_cycle_total_steps = self.args.get("one_cycle_total_steps", ONE_CYCLE_TOTAL_STEPS)
 
-        self.train_acc = pl.metrics.Accuracy()
-        self.val_acc = pl.metrics.Accuracy()
-        self.test_acc = pl.metrics.Accuracy()
+        self.train_acc = Accuracy()
+        self.val_acc = Accuracy()
+        self.test_acc = Accuracy()
 
     @staticmethod
     def add_to_argparse(parser):
@@ -48,7 +64,9 @@ def configure_optimizers(self):
         optimizer = self.optimizer_class(self.parameters(), lr=self.lr)
         if self.one_cycle_max_lr is None:
             return optimizer
-        scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, max_lr=self.one_cycle_max_lr, total_steps=self.one_cycle_total_steps)
+        scheduler = torch.optim.lr_scheduler.OneCycleLR(
+            optimizer=optimizer, max_lr=self.one_cycle_max_lr, total_steps=self.one_cycle_total_steps
+        )
         return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}
 
     def forward(self, x):
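The Accuracy subclass added above exists because the lab models hand raw logits to the metric. Softmax is monotone within each row, so normalizing changes the values but never the argmax, which is all accuracy cares about; a quick demonstration:

import torch

logits = torch.tensor([[2.0, -1.0], [-3.0, 0.5]])    # raw scores, outside [0, 1]
probs = torch.nn.functional.softmax(logits, dim=-1)  # valid probabilities
assert torch.equal(logits.argmax(dim=-1), probs.argmax(dim=-1))
print(probs.min().item() >= 0 and probs.max().item() <= 1)  # True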
43 changes: 6 additions & 37 deletions lab1/text_recognizer/util.py
@@ -1,15 +1,14 @@
"""Utility functions for text_recognizer module."""
from concurrent.futures import as_completed, ThreadPoolExecutor
from pathlib import Path
from typing import Union
from urllib.request import urlopen, urlretrieve
from urllib.request import urlretrieve

# import base64
import hashlib
import os

from PIL import Image
from tqdm import tqdm
import numpy as np
import cv2


def to_categorical(y, num_classes):
@@ -26,34 +25,6 @@ def read_image_pil(image_uri: Union[Path, str], grayscale=False) -> Image:
     return image
 
 
-def read_image(image_uri: Union[Path, str], grayscale=False) -> np.array:
-    """Read image_uri."""
-
-    def read_image_from_filename(image_filename, imread_flag):
-        return cv2.imread(str(image_filename), imread_flag)
-
-    def read_image_from_url(image_url, imread_flag):
-        url_response = urlopen(str(image_url))  # nosec
-        img_array = np.array(bytearray(url_response.read()), dtype=np.uint8)
-        return cv2.imdecode(img_array, imread_flag)
-
-    imread_flag = cv2.IMREAD_GRAYSCALE if grayscale else cv2.IMREAD_COLOR
-    local_file = os.path.exists(image_uri)
-    try:
-        img = None
-        if local_file:
-            img = read_image_from_filename(image_uri, imread_flag)
-        else:
-            img = read_image_from_url(image_uri, imread_flag)
-        assert img is not None
-    except Exception as e:
-        raise ValueError("Could not load image at {}: {}".format(image_uri, e))
-    return img
-
-
-def write_image(image: np.ndarray, filename: Union[Path, str]) -> None:
-    """Write image to file."""
-    cv2.imwrite(str(filename), image)
 
 
 def compute_sha256(filename: Union[Path, str]):
@@ -69,11 +40,11 @@ def update_to(self, blocks=1, bsize=1, tsize=None):
"""
Parameters
----------
blocks : int, optional
blocks: int, optional
Number of blocks transferred so far [default: 1].
bsize : int, optional
bsize: int, optional
Size of each block (in tqdm units) [default: 1].
tsize : int, optional
tsize: int, optional
Total size (in tqdm units). If [default: None] remains unchanged.
"""
if tsize is not None:
@@ -85,5 +56,3 @@ def download_url(url, filename):
"""Download a file from url to filename, with a progress bar."""
with TqdmUpTo(unit="B", unit_scale=True, unit_divisor=1024, miniters=1) as t:
urlretrieve(url, filename, reporthook=t.update_to, data=None) # nosec


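TqdmUpTo works because urlretrieve calls its reporthook with (block count, block size, total size) after each chunk, and update_to converts that into a cumulative byte count for tqdm. A minimal usage sketch (the URL is a placeholder, not from the repo):

from text_recognizer.util import download_url

# Streams the file to disk while rendering a byte-scaled progress bar.
download_url("https://example.com/archive.zip", "archive.zip")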
8 changes: 4 additions & 4 deletions lab1/training/run_experiment.py
@@ -72,7 +72,7 @@ def main():
     data = data_class(args)
     model = model_class(data_config=data.config(), args=args)
 
-    if args.loss not in ('ctc', 'transformer'):
+    if args.loss not in ("ctc", "transformer"):
         lit_model_class = lit_models.BaseLitModel
 
     if args.load_checkpoint is not None:
@@ -84,19 +84,19 @@

     early_stopping_callback = pl.callbacks.EarlyStopping(monitor="val_loss", mode="min", patience=10)
     model_checkpoint_callback = pl.callbacks.ModelCheckpoint(
-        filename='{epoch:03d}-{val_loss:.3f}-{val_cer:.3f}',
-        monitor="val_loss",
-        mode="min"
+        filename="{epoch:03d}-{val_loss:.3f}-{val_cer:.3f}", monitor="val_loss", mode="min"
     )
     callbacks = [early_stopping_callback, model_checkpoint_callback]
 
     args.weights_summary = "full"  # Print full summary of the model
     trainer = pl.Trainer.from_argparse_args(args, callbacks=callbacks, logger=logger, weights_save_path="training/logs")
 
+    # pylint: disable=no-member
     trainer.tune(lit_model, datamodule=data)  # If passing --auto_lr_find, this will set learning rate
 
     trainer.fit(lit_model, datamodule=data)
     trainer.test(lit_model, datamodule=data)
+    # pylint: enable=no-member



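Two details of the training script are worth noting. First, trainer.tune() only searches for a learning rate when --auto_lr_find is passed on the command line, e.g. python training/run_experiment.py --data_class=MNIST --model_class=MLP --auto_lr_find. Second, the ModelCheckpoint filename template is filled from logged metrics, so a run that logs val_loss and val_cer will write files named roughly epoch=013-val_loss=0.234-val_cer=0.067.ckpt (the exact name=value formatting depends on the pytorch-lightning version). A hedged sketch of an equivalent standalone setup, assuming a LightningModule that logs both metrics:

import pytorch_lightning as pl

early_stopping = pl.callbacks.EarlyStopping(monitor="val_loss", mode="min", patience=10)
checkpoint = pl.callbacks.ModelCheckpoint(
    filename="{epoch:03d}-{val_loss:.3f}-{val_cer:.3f}", monitor="val_loss", mode="min"
)
trainer = pl.Trainer(max_epochs=5, callbacks=[early_stopping, checkpoint])
# trainer.tune(lit_model, datamodule=data)  # only has an effect with auto_lr_find set
# trainer.fit(lit_model, datamodule=data)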
2 changes: 2 additions & 0 deletions lab2/text_recognizer/data/__init__.py
@@ -5,6 +5,8 @@
 # Hide lines below until Lab 2
 from .emnist import EMNIST
 from .emnist_lines import EMNISTLines
+
+# Hide lines above until Lab 2


