Commit

lab 8 and circleci
sergeyk committed Apr 5, 2021
1 parent f766653 commit 1f25a73
Showing 163 changed files with 19,231 additions and 1,064 deletions.
22 changes: 7 additions & 15 deletions .circleci/config.yml
@@ -1,19 +1,18 @@
 # Python CircleCI 2.0 configuration file
 #
 # Check https://circleci.com/docs/2.0/language-python/ for more details
 #
 version: 2
 jobs:
   build:
     docker:
-      - image: circleci/python:3.7
+      - image: circleci/python:3.6
 
     steps:
       - checkout
 
       - restore_cache:
           keys:
-            - cache-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }}
+            - cache-{{ checksum "requirements/prod.txt" }}-{{ checksum "requirements/dev.txt" }}
 
       - run:
           name: Install Git LFS
@@ -29,39 +28,32 @@ jobs:
       - run:
           name: Install Shellcheck
           command: |
-            curl -OL https://storage.googleapis.com/shellcheck/shellcheck-stable.linux.x86_64.tar.xz
+            curl -OL https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz
             tar xf shellcheck-stable.linux.x86_64.tar.xz
             sudo mv shellcheck-stable/shellcheck /usr/local/bin
           working_directory: /tmp/shellcheck
 
       - run:
           name: install dependencies
           command: |
-            sed -i 's/tensorflow==/tensorflow-cpu==/' requirements.txt
-            pip install -r requirements.txt
-            pip install -r requirements-dev.txt
+            pip install -r requirements/prod.txt -r requirements/dev.txt
       - save_cache:
-          key: cache-{{ checksum "requirements.txt" }}-{{ checksum "requirements-dev.txt" }}
+          key: cache-{{ checksum "requirements/prod.txt" }}-{{ checksum "requirements/dev.txt" }}
           paths:
             - ~/.local
 
       - run:
           name: run linting
           when: always
           command: |
-            cd lab8 && PYTHONPATH=. ./tasks/lint.sh
+            cd lab8; ./tasks/lint.sh
       - run:
           name: run prediction tests
           when: always
           command: |
-            cd lab8 && PYTHONPATH=. pytest -s text_recognizer/tests/*
-      - run:
-          name: run evaluation tests
-          command: |
-            cd lab8 && PYTHONPATH=. pytest -s evaluation/*
+            cd lab8; ./tasks/test.sh
       - store_artifacts:
           path: test-reports
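The new cache key is worth unpacking: CircleCI's {{ checksum "..." }} template expands to a digest of the named file, so after this commit the dependency cache is keyed to both requirements/prod.txt and requirements/dev.txt and is invalidated whenever either set of pins changes. A rough Python sketch of the idea follows; this is a conceptual model only, not CircleCI's actual implementation, and the file paths are simply the ones from the config above:

import hashlib


def cache_key(*requirement_files: str) -> str:
    """Build a cache key that changes whenever any of the given files change."""
    digests = []
    for path in requirement_files:
        with open(path, "rb") as f:
            digests.append(hashlib.sha256(f.read()).hexdigest())
    return "cache-" + "-".join(digests)


print(cache_key("requirements/prod.txt", "requirements/dev.txt"))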
1 change: 1 addition & 0 deletions .gitignore
@@ -44,3 +44,4 @@ _labs
 logs
 .mypy_cache
 notebooks/lightning_logs
+lightning_logs/
1 change: 1 addition & 0 deletions lab1/text_recognizer/data/__init__.py
@@ -4,3 +4,4 @@
 
 
 
+
57 changes: 37 additions & 20 deletions lab1/text_recognizer/data/base_data_module.py
@@ -1,17 +1,16 @@
"""Base DataModule class."""
from pathlib import Path
from typing import Dict
from typing import Collection, Dict, Optional, Tuple, Union
import argparse
import os

from torch.utils.data import ConcatDataset, DataLoader
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torchvision import transforms

from text_recognizer import util
from text_recognizer.data.util import BaseDataset


def load_and_print_info(data_module_class: type) -> None:
def load_and_print_info(data_module_class) -> None:
"""Load EMNISTLines and print info."""
parser = argparse.ArgumentParser()
data_module_class.add_to_argparse(parser)
@@ -26,7 +25,7 @@ def _download_raw_dataset(metadata: Dict, dl_dirname: Path) -> Path:
     dl_dirname.mkdir(parents=True, exist_ok=True)
     filename = dl_dirname / metadata["filename"]
     if filename.exists():
-        return
+        return filename
     print(f"Downloading raw dataset from {metadata['url']} to {filename}...")
     util.download_url(metadata["url"], filename)
     print("Computing SHA-256...")
@@ -52,12 +51,15 @@ def __init__(self, args: argparse.Namespace = None) -> None:
         self.batch_size = self.args.get("batch_size", BATCH_SIZE)
         self.num_workers = self.args.get("num_workers", NUM_WORKERS)
 
-        self.on_gpu = isinstance(self.args.get('gpus', None), (str, int))
+        self.on_gpu = isinstance(self.args.get("gpus", None), (str, int))
 
         # Make sure to set the variables below in subclasses
-        self.dims = None
-        self.output_dims = None
-        self.mapping = None
+        self.dims: Tuple[int, ...]
+        self.output_dims: Tuple[int, ...]
+        self.mapping: Collection
+        self.data_train: Union[BaseDataset, ConcatDataset]
+        self.data_val: Union[BaseDataset, ConcatDataset]
+        self.data_test: Union[BaseDataset, ConcatDataset]
 
     @classmethod
     def data_dirname(cls):
@@ -77,26 +79,41 @@ def config(self):
"""Return important settings of the dataset, which will be passed to instantiate models."""
return {"input_dims": self.dims, "output_dims": self.output_dims, "mapping": self.mapping}

-    def prepare_data(self):
+    def prepare_data(self, *args, **kwargs) -> None:
         """
-        Use this method to do things that might write to disk or that need to be done only from a single GPU in distributed settings (so don't set state `self.x = y`).
+        Use this method to do things that might write to disk or that need to be done only from a single GPU
+        in distributed settings (so don't set state `self.x = y`).
         """
-        pass
 
-    def setup(self, stage=None):
+    def setup(self, stage: Optional[str] = None) -> None:
         """
         Split into train, val, test, and set dims.
         Should assign `torch Dataset` objects to self.data_train, self.data_val, and optionally self.data_test.
         """
-        self.data_train = None
-        self.data_val = None
-        self.data_test = None
 
     def train_dataloader(self):
-        return DataLoader(self.data_train, shuffle=True, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=self.on_gpu)
+        return DataLoader(
+            self.data_train,
+            shuffle=True,
+            batch_size=self.batch_size,
+            num_workers=self.num_workers,
+            pin_memory=self.on_gpu,
+        )
 
     def val_dataloader(self):
-        return DataLoader(self.data_val, shuffle=False, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=self.on_gpu)
+        return DataLoader(
+            self.data_val,
+            shuffle=False,
+            batch_size=self.batch_size,
+            num_workers=self.num_workers,
+            pin_memory=self.on_gpu,
+        )
 
     def test_dataloader(self):
-        return DataLoader(self.data_test, shuffle=False, batch_size=self.batch_size, num_workers=self.num_workers, pin_memory=self.on_gpu)
+        return DataLoader(
+            self.data_test,
+            shuffle=False,
+            batch_size=self.batch_size,
+            num_workers=self.num_workers,
+            pin_memory=self.on_gpu,
+        )
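This refactor replaces the `self.dims = None` placeholders with bare type annotations, so subclasses must assign real values; `setup()` no longer resets the datasets to None, and the three dataloader methods now assume `data_train`/`data_val`/`data_test` exist. A minimal hypothetical subclass showing the contract (`FakeImages` and its torchvision `FakeData` source are illustrative, not part of the repo):

import argparse
from typing import Optional

from torchvision import transforms
from torchvision.datasets import FakeData

from text_recognizer.data.base_data_module import BaseDataModule


class FakeImages(BaseDataModule):
    """Hypothetical DataModule over random images, for illustration only."""

    def __init__(self, args: argparse.Namespace = None) -> None:
        super().__init__(args)
        self.dims = (1, 28, 28)  # input shape, passed to models via config()
        self.output_dims = (1,)
        self.mapping = list(range(10))

    def setup(self, stage: Optional[str] = None) -> None:
        def make(n: int) -> FakeData:
            return FakeData(size=n, image_size=self.dims, num_classes=10, transform=transforms.ToTensor())

        self.data_train, self.data_val, self.data_test = make(512), make(64), make(64)

The `pin_memory=self.on_gpu` argument in the dataloaders above is a small but real optimization: pinned host memory speeds up host-to-GPU copies, and is pointless on CPU-only runs, so it is tied to whether a `--gpus` argument was passed.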
11 changes: 6 additions & 5 deletions lab1/text_recognizer/data/mnist.py
@@ -10,9 +10,10 @@
 DOWNLOADED_DATA_DIRNAME = BaseDataModule.data_dirname() / "downloaded"
 
 # NOTE: temp fix until https://github.com/pytorch/vision/issues/1938 is resolved
-from six.moves import urllib
+from six.moves import urllib  # pylint: disable=wrong-import-position, wrong-import-order
 
 opener = urllib.request.build_opener()
-opener.addheaders = [('User-agent', 'Mozilla/5.0')]
+opener.addheaders = [("User-agent", "Mozilla/5.0")]
 urllib.request.install_opener(opener)

@@ -30,15 +31,15 @@ def __init__(self, args: argparse.Namespace) -> None:
         self.output_dims = (1,)
         self.mapping = list(range(10))
 
-    def prepare_data(self):
+    def prepare_data(self, *args, **kwargs) -> None:
         """Download train and test MNIST data from PyTorch canonical source."""
         TorchMNIST(self.data_dir, train=True, download=True)
         TorchMNIST(self.data_dir, train=False, download=True)
 
-    def setup(self, stage=None):
+    def setup(self, stage=None) -> None:
         """Split into train, val, test, and set dims."""
         mnist_full = TorchMNIST(self.data_dir, train=True, transform=self.transform)
-        self.data_train, self.data_val = random_split(mnist_full, [55000, 5000])
+        self.data_train, self.data_val = random_split(mnist_full, [55000, 5000])  # type: ignore
         self.data_test = TorchMNIST(self.data_dir, train=False, transform=self.transform)


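A hedged usage sketch of the module above, assuming the lab1 package is on PYTHONPATH, the class is named MNIST as elsewhere in the labs, and the base class supplies the default batch size:

import argparse

from text_recognizer.data.mnist import MNIST

parser = argparse.ArgumentParser()
MNIST.add_to_argparse(parser)
args = parser.parse_args([])

data = MNIST(args)
data.prepare_data()  # downloads once; sets no state, safe in distributed runs
data.setup()         # 55,000/5,000 train/val split plus the canonical 10,000 test images
xs, ys = next(iter(data.train_dataloader()))
print(xs.shape, ys.shape)  # e.g. torch.Size([128, 1, 28, 28]) torch.Size([128])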
2 changes: 1 addition & 1 deletion lab1/text_recognizer/data/util.py
@@ -87,6 +87,6 @@ def split_dataset(base_dataset: BaseDataset, fraction: float, seed: int) -> Tupl
"""
split_a_size = int(fraction * len(base_dataset))
split_b_size = len(base_dataset) - split_a_size
return torch.utils.data.random_split(
return torch.utils.data.random_split( # type: ignore
base_dataset, [split_a_size, split_b_size], generator=torch.Generator().manual_seed(seed)
)
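The explicit generator is what makes split_dataset deterministic: the same seed always produces the same index permutation, so train/val membership is stable across runs and machines. A self-contained check of that property:

import torch
from torch.utils.data import TensorDataset, random_split

base = TensorDataset(torch.arange(10))
a1, b1 = random_split(base, [8, 2], generator=torch.Generator().manual_seed(42))
a2, b2 = random_split(base, [8, 2], generator=torch.Generator().manual_seed(42))
assert list(a1.indices) == list(a2.indices) and list(b1.indices) == list(b2.indices)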
30 changes: 24 additions & 6 deletions lab1/text_recognizer/lit_models/base.py
@@ -9,7 +9,23 @@
 ONE_CYCLE_TOTAL_STEPS = 100
 
 
-class BaseLitModel(pl.LightningModule):
+class Accuracy(pl.metrics.Accuracy):
+    """Accuracy Metric with a hack."""
+
+    def update(self, preds: torch.Tensor, target: torch.Tensor) -> None:
+        """
+        Metrics in Pytorch-lightning 1.2+ versions expect preds to be between 0 and 1 else fails with the ValueError:
+        "The `preds` should be probabilities, but values were detected outside of [0,1] range."
+        This is being tracked as a bug in https://github.com/PyTorchLightning/metrics/issues/60.
+        This method just hacks around it by normalizing preds before passing it in.
+        Normalized preds are not necessary for accuracy computation as we just care about argmax().
+        """
+        if preds.min() < 0 or preds.max() > 1:
+            preds = torch.nn.functional.softmax(preds, dim=-1)
+        super().update(preds=preds, target=target)
+
+
+class BaseLitModel(pl.LightningModule):  # pylint: disable=too-many-ancestors
     """
     Generic PyTorch-Lightning class that must be initialized with a PyTorch module.
     """
@@ -25,15 +41,15 @@ def __init__(self, model, args: argparse.Namespace = None):
         self.lr = self.args.get("lr", LR)
 
         loss = self.args.get("loss", LOSS)
-        if not loss in ("ctc", "transformer"):
+        if loss not in ("ctc", "transformer"):
             self.loss_fn = getattr(torch.nn.functional, loss)
 
         self.one_cycle_max_lr = self.args.get("one_cycle_max_lr", None)
         self.one_cycle_total_steps = self.args.get("one_cycle_total_steps", ONE_CYCLE_TOTAL_STEPS)
 
-        self.train_acc = pl.metrics.Accuracy()
-        self.val_acc = pl.metrics.Accuracy()
-        self.test_acc = pl.metrics.Accuracy()
+        self.train_acc = Accuracy()
+        self.val_acc = Accuracy()
+        self.test_acc = Accuracy()
 
     @staticmethod
     def add_to_argparse(parser):
@@ -48,7 +64,9 @@ def configure_optimizers(self):
         optimizer = self.optimizer_class(self.parameters(), lr=self.lr)
         if self.one_cycle_max_lr is None:
             return optimizer
-        scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, max_lr=self.one_cycle_max_lr, total_steps=self.one_cycle_total_steps)
+        scheduler = torch.optim.lr_scheduler.OneCycleLR(
+            optimizer=optimizer, max_lr=self.one_cycle_max_lr, total_steps=self.one_cycle_total_steps
+        )
         return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}
 
     def forward(self, x):
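The Accuracy subclass added above exists because the lab models hand raw logits to the metric. Softmax is monotone within each row, so normalizing changes the values but never the argmax, which is all accuracy cares about; a quick demonstration:

import torch

logits = torch.tensor([[2.0, -1.0], [-3.0, 0.5]])    # raw scores, outside [0, 1]
probs = torch.nn.functional.softmax(logits, dim=-1)  # valid probabilities
assert torch.equal(logits.argmax(dim=-1), probs.argmax(dim=-1))
print(probs.min().item() >= 0 and probs.max().item() <= 1)  # True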
43 changes: 6 additions & 37 deletions lab1/text_recognizer/util.py
@@ -1,15 +1,14 @@
"""Utility functions for text_recognizer module."""
from concurrent.futures import as_completed, ThreadPoolExecutor
from pathlib import Path
from typing import Union
from urllib.request import urlopen, urlretrieve
from urllib.request import urlretrieve

# import base64
import hashlib
import os

from PIL import Image
from tqdm import tqdm
import numpy as np
import cv2


def to_categorical(y, num_classes):
@@ -26,34 +25,6 @@ def read_image_pil(image_uri: Union[Path, str], grayscale=False) -> Image:
     return image
 
 
-def read_image(image_uri: Union[Path, str], grayscale=False) -> np.array:
-    """Read image_uri."""
-
-    def read_image_from_filename(image_filename, imread_flag):
-        return cv2.imread(str(image_filename), imread_flag)
-
-    def read_image_from_url(image_url, imread_flag):
-        url_response = urlopen(str(image_url))  # nosec
-        img_array = np.array(bytearray(url_response.read()), dtype=np.uint8)
-        return cv2.imdecode(img_array, imread_flag)
-
-    imread_flag = cv2.IMREAD_GRAYSCALE if grayscale else cv2.IMREAD_COLOR
-    local_file = os.path.exists(image_uri)
-    try:
-        img = None
-        if local_file:
-            img = read_image_from_filename(image_uri, imread_flag)
-        else:
-            img = read_image_from_url(image_uri, imread_flag)
-        assert img is not None
-    except Exception as e:
-        raise ValueError("Could not load image at {}: {}".format(image_uri, e))
-    return img
-
-
-def write_image(image: np.ndarray, filename: Union[Path, str]) -> None:
-    """Write image to file."""
-    cv2.imwrite(str(filename), image)
 
 
 def compute_sha256(filename: Union[Path, str]):
@@ -69,11 +40,11 @@ def update_to(self, blocks=1, bsize=1, tsize=None):
"""
Parameters
----------
blocks : int, optional
blocks: int, optional
Number of blocks transferred so far [default: 1].
bsize : int, optional
bsize: int, optional
Size of each block (in tqdm units) [default: 1].
tsize : int, optional
tsize: int, optional
Total size (in tqdm units). If [default: None] remains unchanged.
"""
if tsize is not None:
@@ -85,5 +56,3 @@ def download_url(url, filename):
"""Download a file from url to filename, with a progress bar."""
with TqdmUpTo(unit="B", unit_scale=True, unit_divisor=1024, miniters=1) as t:
urlretrieve(url, filename, reporthook=t.update_to, data=None) # nosec


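TqdmUpTo works because urlretrieve calls its reporthook with (block count, block size, total size) after each chunk, and update_to converts that into a cumulative byte count for tqdm. A minimal usage sketch (the URL is a placeholder, not from the repo):

from text_recognizer.util import download_url

# Streams the file to disk while rendering a byte-scaled progress bar.
download_url("https://example.com/archive.zip", "archive.zip")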
8 changes: 4 additions & 4 deletions lab1/training/run_experiment.py
@@ -72,7 +72,7 @@ def main():
     data = data_class(args)
     model = model_class(data_config=data.config(), args=args)
 
-    if args.loss not in ('ctc', 'transformer'):
+    if args.loss not in ("ctc", "transformer"):
         lit_model_class = lit_models.BaseLitModel
 
     if args.load_checkpoint is not None:
@@ -84,19 +84,19 @@

     early_stopping_callback = pl.callbacks.EarlyStopping(monitor="val_loss", mode="min", patience=10)
     model_checkpoint_callback = pl.callbacks.ModelCheckpoint(
-        filename='{epoch:03d}-{val_loss:.3f}-{val_cer:.3f}',
-        monitor="val_loss",
-        mode="min"
+        filename="{epoch:03d}-{val_loss:.3f}-{val_cer:.3f}", monitor="val_loss", mode="min"
     )
     callbacks = [early_stopping_callback, model_checkpoint_callback]
 
     args.weights_summary = "full"  # Print full summary of the model
     trainer = pl.Trainer.from_argparse_args(args, callbacks=callbacks, logger=logger, weights_save_path="training/logs")
 
+    # pylint: disable=no-member
     trainer.tune(lit_model, datamodule=data)  # If passing --auto_lr_find, this will set learning rate
 
     trainer.fit(lit_model, datamodule=data)
     trainer.test(lit_model, datamodule=data)
+    # pylint: enable=no-member



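Two details of the training script are worth noting. First, trainer.tune() only searches for a learning rate when --auto_lr_find is passed on the command line, e.g. python training/run_experiment.py --data_class=MNIST --model_class=MLP --auto_lr_find. Second, the ModelCheckpoint filename template is filled from logged metrics, so a run that logs val_loss and val_cer will write files named roughly epoch=013-val_loss=0.234-val_cer=0.067.ckpt (the exact name=value formatting depends on the pytorch-lightning version). A hedged sketch of an equivalent standalone setup, assuming a LightningModule that logs both metrics:

import pytorch_lightning as pl

early_stopping = pl.callbacks.EarlyStopping(monitor="val_loss", mode="min", patience=10)
checkpoint = pl.callbacks.ModelCheckpoint(
    filename="{epoch:03d}-{val_loss:.3f}-{val_cer:.3f}", monitor="val_loss", mode="min"
)
trainer = pl.Trainer(max_epochs=5, callbacks=[early_stopping, checkpoint])
# trainer.tune(lit_model, datamodule=data)  # only has an effect with auto_lr_find set
# trainer.fit(lit_model, datamodule=data)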
2 changes: 2 additions & 0 deletions lab2/text_recognizer/data/__init__.py
@@ -5,6 +5,8 @@
 # Hide lines below until Lab 2
 from .emnist import EMNIST
 from .emnist_lines import EMNISTLines
+
+# Hide lines above until Lab 2


